-
-
Notifications
You must be signed in to change notification settings - Fork 53
/
server.py
572 lines (524 loc) · 27.3 KB
/
server.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
import argparse
import uuid
import json
import socket
import os
from bson import json_util
try:
from urllib import quote # Python 2.X
except ImportError:
from urllib.parse import quote # Python 3+
import re
import shutil
import string
import base64
from datetime import datetime
# non standards, in requirements.txt
from flask import Flask, request, Markup, render_template, redirect, url_for, send_from_directory, session
from flask_session import Session
from flask_gzip import Gzip
import github
from authlib.flask.client import OAuth
# use loginpass to make OAuth connection simpler
from loginpass import create_flask_blueprint, GitHub
def handle_authorize(remote, token, user_info):
    """
    callback given to the loginpass GitHub blueprint: remember the authenticated user
    :param remote: the OAuth remote (not used here)
    :param token: the OAuth token of the user
    :param user_info: the information describing the user
    :return: a redirection to the /myrepo page
    """
    authenticated_user = dict(user=user_info, token=token)
    session['user'] = authenticated_user
    return redirect('/myrepo')
def clean_github_repository(repo):
    """
    get the username/repository from a Github url
    :param repo:str the Github url of the repository
    :return: a (username, repository, branch) tuple, branch is None when the url has no
             /tree/<branch> part; None when repo is None
    :raises ValueError: when the url does not contain a username and a repository
    """
    if repo is None:
        return None
    repo = repo.replace("http://github.com/", "") \
        .replace("https://github.com/", "")
    # rstrip is safe on an empty string, indexing repo[-1] is not
    split_repo = repo.rstrip('/').split("/")
    if len(split_repo) < 2 or not split_repo[0] or not split_repo[1]:
        raise ValueError("Invalid Github repository: %r" % repo)
    username, repository = split_repo[0:2]
    branch = None
    # a url ending with ".../tree" has no branch name to read
    if len(split_repo) > 3 and split_repo[2] == "tree":
        branch = split_repo[3]
    return username, repository, branch
TEXT_CHARACTERS = ''.join([chr(code) for code in range(32,127)] + list('\b\f\n\r\t'))
# same characters as a set, for constant-time membership tests
_TEXT_CHARACTER_SET = frozenset(TEXT_CHARACTERS)


def istext(s, threshold=0.30):
    """
    guess if a content is text or binary
    :param s: the content to check, bytes are decoded as utf8 first
    :param threshold: the maximum accepted ratio of characters outside TEXT_CHARACTERS
    :return: True if the content looks like text
    """
    if type(s) != str:
        try:
            s = s.decode('utf8')
        except UnicodeDecodeError:
            # bytes that are not valid utf8 cannot be displayed as text
            return False
    # if s contains any null, it's not text:
    if '\x00' in s:
        return False
    # an "empty" string is "text" (arbitrary but reasonable choice):
    if not s:
        return True
    binary_length = sum(1 for char in s if char not in _TEXT_CHARACTER_SET)
    # s is 'text' if at most 30% (threshold) of its characters are non-text ones:
    return float(binary_length) / len(s) <= threshold
class Anonymous_Github:
def __init__(self,
             github_token,
             host="127.0.0.1",
             port=5000,
             config_dir='./repositories',
             secret_key=None,
             client_id=None,
             client_secret=None):
    """
    configure the server and create the Flask application
    :param github_token: the GitHub token, read from GITHUB_AUTH_TOKEN when empty
    :param host: the host, "0.0.0.0" or the public url of the server (see set_public_url)
    :param port: the port of the application
    :param config_dir: the folder of the repository configurations, a leading "./" is
                       resolved against the folder of this file
    :param secret_key: the Flask secret key, read from SECRET_KEY when empty
    :param client_id: the GitHub OAuth client id, read from GITHUB_CLIENT_ID when empty
    :param client_secret: the GitHub OAuth client secret, read from GITHUB_CLIENT_SECRET when empty
    :raises KeyError: when a value is "" and its environment variable is not defined
    """
    def from_environment(value, variable):
        # "" explicitly asks for the environment variable (KeyError when it is missing)
        if value == "":
            return os.environ[variable]
        # None is the declared default: use the variable when defined, else stay None
        if value is None:
            return os.environ.get(variable)
        return value

    self.github_token = from_environment(github_token, "GITHUB_AUTH_TOKEN")
    self.secret_key = from_environment(secret_key, "SECRET_KEY")
    self.client_id = from_environment(client_id, "GITHUB_CLIENT_ID")
    self.client_secret = from_environment(client_secret, "GITHUB_CLIENT_SECRET")
    self.host = host
    self.port = port
    self.config_dir = config_dir
    if config_dir.startswith("./"):
        self.config_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_dir[2:])
    if not os.path.exists(self.config_dir):
        os.makedirs(self.config_dir)
    self.application = self.create_flask_application()
    self.set_public_url()
    self.github = github.Github(login_or_token=self.github_token)
def set_public_url(self):
    """
    compute self.public_url, the url without trailing slash used in the anonymized links
    when the host is "0.0.0.0" the url is built from the machine name and the port,
    otherwise the host is used as it is
    """
    if self.host == "0.0.0.0":
        self.public_url = "http://" + socket.getfqdn() + ":" + str(self.port)
    else:
        self.public_url = self.host
    # endswith() accepts an empty host, indexing [-1] raises an IndexError
    if self.public_url.endswith('/'):
        self.public_url = self.public_url[0:-1]
# Build and configure the Flask application: sessions, GitHub OAuth, template filters
# and routes. The filters and the routes are the nested functions that follow.
def create_flask_application(self):
application = Flask(__name__)
# enable the flask_gzip extension on the application
gzip = Gzip(application)
application.log = {}
# random path segment, the shutdown route is registered under it
application.killurl = str(uuid.uuid4())
application.jinja_env.add_extension('jinja2.ext.do')
application.config.update(
# settings read by flask_session and by the OAuth client
SESSION_TYPE='filesystem',
PERMANENT_SESSION_LIFETIME=60*15, # 15 min
SECRET_KEY=self.secret_key,
GITHUB_CLIENT_ID=self.client_id,
GITHUB_CLIENT_SECRET=self.client_secret,
GITHUB_CLIENT_KWARGS = {
'scope': 'repo'
}
)
Session(application)
oauth = OAuth(application)
# the loginpass blueprint calls handle_authorize once the user is authenticated
github_bp = create_flask_blueprint(GitHub, oauth, handle_authorize)
application.register_blueprint(github_bp, url_prefix='/github')
@application.template_filter('remove_terms', )
def remove_terms(content, repository_configuration, word_boundaries=True, whole_urls=True):
    """
    remove the blacklisted terms from the content
    :param content: the content to anonymize
    :param repository_configuration: the configuration of the repository
    :param word_boundaries: only match the terms on word boundaries
    :param whole_urls: replace the complete url when it contains a term
    :return: the anonymized content
    """
    anonymous_url = "%s/repository/%s" % (self.public_url, repository_configuration["id"])

    def to_anonymous_url(match):
        # a function is used as replacement to insert the url literally
        return anonymous_url

    repo = repository_configuration['repository'].rstrip('/')
    # an empty pattern matches everywhere, skip the replacement in that case
    if repo:
        # the url is a literal text: escape it, otherwise "." matches any character
        escaped_repo = re.escape(repo)
        content = re.compile("%s/blob/master" % escaped_repo, re.IGNORECASE).sub(to_anonymous_url, content)
        content = re.compile(escaped_repo, re.IGNORECASE).sub(to_anonymous_url, content)

    # the url pattern does not depend on the term: compile it once
    url_regex = re.compile('\\b((https?|ftp|file)://)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\\b')
    for term in repository_configuration['terms']:
        # a blank line in the terms would replace every word boundary by XXX
        if not term.strip():
            continue
        try:
            regex = re.compile(r'\b%s\b' % term if word_boundaries else term, re.IGNORECASE)
        except re.error:
            # the term is not a valid regular expression: match it literally
            literal = re.escape(term)
            regex = re.compile(r'\b%s\b' % literal if word_boundaries else literal, re.IGNORECASE)

        if whole_urls:
            def sub_url(m):
                if regex.search(m.group(0)):
                    return 'XXX'
                return m.group(0)

            content = url_regex.sub(sub_url, content)
        content = regex.sub("XXX", content)
    return content
@application.template_filter('file_render', )
def file_render(file, repository_configuration):
    """
    produce the html representation of a file
    :param file: the file to display
    :param repository_configuration: the configuration of the repository
    :return: the html representation of the file
    """
    if type(file) == github.Commit.Commit:
        return Markup(remove_terms(render_template('patch.html', patch=file), repository_configuration))
    if file.type == 'dir':
        return ""
    # the file name comes from the repository: "Markup(...) % value" escapes it,
    # formatting the string before wrapping it in Markup does not
    if file.size > 1000000:
        return Markup("The file %s is too big to be anonymized (beyond 1MB, Github limit)") % (file.name,)

    # images are detected on the real (last) extension and before the markdown rule,
    # otherwise an upper case name such as LOGO.PNG is sent to the markdown renderer
    image_types = {'jpg': 'jpeg', 'jpeg': 'jpeg', 'png': 'png', 'gif': 'gif'}
    file_extension = os.path.splitext(file.name)[1].lower().lstrip('.')
    if file_extension in image_types:
        return Markup("<img src='data:image/%s;base64, %s' alt='%s'>") % (
            image_types[file_extension], file.content, file.name)

    if ".md" in file.name or file.name == file.name.upper() or "changelog" == file.name.lower():
        gh = self.github
        if 'token' in repository_configuration and repository_configuration['token'] is not None:
            gh = github.Github(repository_configuration['token'])
        return Markup("<div class='markdown-body'>%s</div>" % remove_terms(
            gh.render_markdown(file.decoded_content.decode('utf-8')),
            repository_configuration))

    if istext(file.decoded_content):
        return Markup("<pre><code>{}</code></pre>")\
            .format(Markup.escape(remove_terms(file.decoded_content.decode("utf-8"), repository_configuration)))
    return Markup("<b>%s has an unknown extension, we are unable to anonymize it (known extensions md/txt/json/java/...)</b>") % (file.name,)
@application.route('/' + application.killurl, methods=['POST'])
def seriouslykill():
    """
    stop the development server
    :return: a confirmation message
    :raises RuntimeError: when the server does not provide the Werkzeug shutdown hook
    """
    func = request.environ.get('werkzeug.server.shutdown')
    if func is None:
        # the hook is absent from the request environment: fail with an explicit
        # error instead of calling None
        raise RuntimeError('Not running with the Werkzeug Server')
    func()
    return "Shutting down..."
def get_element_from_path(g_repo, g_commit, path):
    """
    look for the element of the repository located at the given path
    :param g_repo: the github repository
    :param g_commit: the github commit, its sha selects the version that is read
    :param path: the path of the element, '' for the root of the repository
    :return: (element, content of its folder); the element is None when it is not found
             and the folder is None for the root
    """
    if path == '':
        return g_repo.get_contents('', g_commit.sha), None

    parent_folder, wanted_name = os.path.split(path)
    siblings = g_repo.get_contents(quote(parent_folder), g_commit.sha)
    # first entry of the parent folder with the requested name, if any
    found = next((entry for entry in siblings if entry.name == wanted_name), None)
    return found, siblings
@application.route('/myrepo', methods=['GET'])
def myrepo():
    """
    list the repositories of the authenticated user
    :return: the newrepo.html page, or a redirection to the GitHub login when no
             token is stored in the session
    """
    user = session.get('user', None)
    has_token = user is not None and 'token' in user and user['token'] is not None
    if not has_token:
        return redirect('github/login')

    client = github.Github(user['token']['access_token'])
    repos = client.get_user().get_repos(sort="full_name")
    # every repository receives a fresh identifier
    for user_repository in repos:
        user_repository.uuid = str(uuid.uuid4())
    return render_template('newrepo.html', repos=repos)
@application.route('/repository/<id>/commit/<sha>', methods=['GET'])
def commit(id, sha):
    """
    display anonymously a commit from the repository
    :param id: the repository id
    :param sha: the commit id
    :return: the page of the commit, or the 404 page when the repository is unknown
    """
    config_path = self.config_dir + "/" + str(id) + "/config.json"
    if not os.path.exists(config_path):
        return render_template('404.html'), 404
    with open(config_path) as f:
        data = json.load(f, object_hook=json_util.object_hook)
    (username, repo, branch) = clean_github_repository(data['repository'])
    gh = self.github
    # the configuration stores "token": None when no user token is known, in that
    # case the default client has to be kept (same rule as the repository view)
    if data.get('token') is not None:
        gh = github.Github(data['token'])
    g_repo = gh.get_repo("%s/%s" % (username, repo))
    g_commit = g_repo.get_commit(sha)
    return render_template('repo.html',
                           repository=data,
                           current_repository=id,
                           current_file=g_commit,
                           files=[],
                           path=[])
def is_up_to_date(repository_config, g_commit):
    """
    tell if the cache was built from the given commit
    :param repository_config: the repository configuration
    :param g_commit: the Github commit
    :return: True if the "pushed_at" value of the configuration is the date of the commit
    """
    last_modified = datetime.strptime(g_commit.last_modified, "%a, %d %b %Y %H:%M:%S %Z")
    if 'pushed_at' not in repository_config:
        return False
    # NOTE(review): "%s" is not listed in the documented strftime directives, confirm
    # that it is supported on the deployment platform
    return last_modified.strftime("%s") == repository_config["pushed_at"]
def get_type_content(file_name, path, repository_configuration, g_repo, is_website):
    """
    Get the content type of a file from its name
    :param file_name: the filename
    :param path: the path of the file
    :param repository_configuration: the repository configuration
    :param g_repo: the Github repository
    :param is_website: True if the file is served as a page of a website
    :return: the content type, always html when is_website is False
    """
    html = 'text/html; charset=utf-8'
    if not is_website:
        return html

    def contains(*markers):
        # the markers are searched anywhere in the name, not only at its end
        return any(marker in file_name for marker in markers)

    # the rules are applied in order, a later rule overrides an earlier one
    content_type = 'text/plain; charset=utf-8'
    if contains(".html", ".md") or file_name == file_name.upper():
        content_type = html

    if contains(".png"):
        content_type = 'image/png'
    elif contains(".gif"):
        content_type = 'image/gif'
    elif contains(".jpg"):
        content_type = 'image/jpeg'

    # ".json" is tested before ".js" because it contains it
    if contains(".xml"):
        content_type = 'application/xml; charset=utf-8'
    elif contains(".json"):
        content_type = 'application/json; charset=utf-8'
    elif contains(".js"):
        content_type = 'application/javascript; charset=utf-8'
    elif contains(".txt", ".log", ".csv", ".java", ".py", ".lua"):
        content_type = 'text/plain; charset=utf-8'

    if contains(".css"):
        content_type = 'text/css; charset=utf-8'
    return content_type
def get_content(current_file, files, path, repository_config, g_repo):
    """
    get the content of the page and store it in the cache of the repository
    :param current_file: the current file
    :param files: the list of file of the current directory
    :param path: the accessed path
    :param repository_config: the repository configuration
    :param g_repo: the Github repository
    :return: the content of the page, or the cached file when it exists
    """
    cache_path = os.path.join(self.config_dir, repository_config['id'], "cache")
    file_path = path
    if current_file is not None:
        if current_file.type == 'dir':
            file_path = os.path.join(current_file.path, "index.html")
        else:
            file_path = current_file.path
    cached_file_path = os.path.join(cache_path, file_path)
    content_type = get_type_content(path, path, repository_config, g_repo, False).replace("; charset=utf-8", "")
    if os.path.exists(cached_file_path):
        return send_from_directory(os.path.dirname(cached_file_path), os.path.basename(cached_file_path),
                                   mimetype=content_type)
    content = ''
    # current_file can be None (see the test above), it is then rendered as a listing
    if current_file is not None and current_file.type != 'dir' and is_website(path, repository_config, g_repo):
        if current_file.size > 1000000:
            # the content of big files has to be read from the blob
            blob = g_repo.get_git_blob(current_file.sha)
            if blob.encoding == 'base64':
                content = base64.b64decode(blob.content).decode('utf-8')
            else:
                content = blob.content
                # only bytes have to be decoded, a str has no decode() in Python 3
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
        else:
            content = current_file.decoded_content.decode('utf-8')
        if "text" in content_type:
            content = remove_terms(content, repository_config)
        if ".md" in current_file.name:
            gh = self.github
            if 'token' in repository_config and repository_config['token'] is not None:
                gh = github.Github(repository_config['token'])
            content = remove_terms(gh.render_markdown(content), repository_config)
    else:
        tree = files
        if type(tree) != list:
            tree = files.tree
        is_folder = type(current_file) is not github.ContentFile.ContentFile or current_file.type == 'dir'
        content = render_template('repo.html',
                                  repository=repository_config,
                                  current_repository=repository_config['id'],
                                  current_file=current_file,
                                  files=tree,
                                  path_directory=path if is_folder else os.path.dirname(current_file.path),
                                  path=path.split("/") if path != '' else [])
    cache_directory = os.path.dirname(cached_file_path)
    if not os.path.exists(cache_directory):
        os.makedirs(cache_directory)
    # the cache is written as utf8 bytes so that it does not depend on the locale
    cached_bytes = content if isinstance(content, bytes) else content.encode('utf8')
    with open(cached_file_path, 'wb') as f:
        f.write(cached_bytes)
    return content
def is_website(path, repository_config, g_repo):
    """
    Check if the current request is a request to a GitHub pages
    :param path: the current path
    :param repository_config: the repository configuration
    :param g_repo: the Github repository
    :return: True if the current path is the docs folder or is inside it
    """
    # comparing the 4 first characters also accepted "docsify/..." or "docs.md"
    return path == "docs" or path.startswith("docs/")
def is_default_file(f):
    """
    tell if an element is a default file: its name contains "readme" or "index"
    :param f: a ContentFile (its name is checked) or a GitTreeElement (its path is checked)
    :return: True if the element is a default file
    """
    element_type = type(f)
    if element_type is github.ContentFile.ContentFile:
        label = f.name.lower()
    elif element_type is github.GitTreeElement.GitTreeElement:
        label = f.path.lower()
    else:
        # any other type of element is accepted without looking at its name
        return True
    return any(default_name in label for default_name in ("readme", "index"))
def get_current_folder_files(path, current_file, repository_config, g_repo, g_commit):
"""
get the list of files of the current repository
:param path: the path to the current file
:param current_file: the current file
:param repository_config: the repository configuration
:param g_repo: the GitHub repository
:param g_commit: the GitHub commit, its sha is used to read the tree and the paths
:return: a (files, current_file) tuple: the listing and the element to display
"""
# NOTE(review): the indentation of this block is missing in this copy of the file, the
# comments describe the statements without asserting how the branches are nested
files = []
# nothing to display: the empty list is returned with the untouched current_file
if current_file is None:
return files, current_file
# current_file is not a single ContentFile (presumably the list of the elements of a
# folder - confirm with the caller): the listing is the tree of the commit
if type(current_file) is not github.ContentFile.ContentFile:
files = g_repo.get_git_tree(g_commit.sha)
# the first element accepted by is_default_file becomes the current file
for f in current_file:
if is_default_file(f):
current_file = f
break
# no default file was selected: fall back to the first element
if type(current_file) is not github.ContentFile.ContentFile:
current_file = current_file[0]
# a file: the listing is the content of the folder of the file
elif current_file.type == 'file':
if os.path.dirname(path) == '':
files = g_repo.get_git_tree(g_commit.sha)
else:
f, folder = get_element_from_path(g_repo, g_commit, os.path.dirname(path))
# the parent folder was not resolved: use the content returned by the lookup
if f is None:
files = folder
else:
files = g_repo.get_git_tree(f.sha)
# any other element: the listing is its own tree
else:
files = g_repo.get_git_tree(current_file.sha)
# the first element of the tree accepted by is_default_file is loaded as current file
for f in files.tree:
if is_default_file(f):
current_file, folder = get_element_from_path(g_repo, g_commit, os.path.join(path, f.path))
break
# a tree with a single ContentFile element: load this element as current file
if len(files.tree) == 1 and type(files.tree[0]) is github.ContentFile.ContentFile:
current_file, folder = get_element_from_path(g_repo, g_commit, os.path.join(path, files.tree[0].path))
return files, current_file
@application.route('/repository/<id>', methods=['GET'], defaults={'path': ''})
@application.route('/repository/<id>/', methods=['GET'], defaults={'path': ''})
@application.route('/repository/<id>/<path:path>', methods=['GET'])
@application.route('/r/<id>', methods=['GET'], defaults={'path': ''})
@application.route('/r/<id>/', methods=['GET'], defaults={'path': ''})
@application.route('/r/<id>/<path:path>', methods=['GET'])
def repository(id, path):
    """
    display anonymously a file or a folder of the repository
    :param id: the repository id
    :param path: the path of the element in the repository, '' for the root
    :return: the anonymized content, a redirection, or the 404/empty page
    """
    repo_path = self.config_dir + "/" + str(id)
    config_path = repo_path + "/config.json"
    if not os.path.exists(config_path):
        return render_template('404.html'), 404
    with open(config_path, 'r') as f:
        repository_configuration = json.load(f, object_hook=json_util.object_hook)
    if 'expiration_date' in repository_configuration and repository_configuration['expiration_date'] is not None:
        if repository_configuration['expiration_date'] <= datetime.now(repository_configuration['expiration_date'].tzinfo):
            if repository_configuration['expiration'] == 'redirect':
                return redirect(repository_configuration['repository'])
            elif repository_configuration['expiration'] == 'remove':
                return render_template('404.html'), 404
    (username, repo, branch) = clean_github_repository(repository_configuration['repository'])
    gh = self.github
    if 'token' in repository_configuration and repository_configuration['token'] is not None:
        gh = github.Github(repository_configuration['token'])
    g_commit = None
    try:
        g_repo = gh.get_repo("%s/%s" % (username, repo))
        if branch is None:
            branch = g_repo.default_branch
        g_commit = g_repo.get_commit(branch)
    except Exception:
        # any failure while reading the repository displays the empty page, but
        # SystemExit and KeyboardInterrupt are no longer swallowed
        return render_template('empty.html'), 404

    if not is_up_to_date(repository_configuration, g_commit):
        if os.path.exists(os.path.join(repo_path, "cache")):
            shutil.rmtree(os.path.join(repo_path, "cache"))
        commit_date = datetime.strptime(g_commit.last_modified, "%a, %d %b %Y %H:%M:%S %Z")
        repository_configuration["pushed_at"] = commit_date.strftime("%s")
        with open(config_path, 'w') as fa:
            json.dump(repository_configuration, fa, default=json_util.default)

    cache_path = os.path.normpath(os.path.join(self.config_dir, id, "cache"))
    cached_element = os.path.normpath(os.path.join(cache_path, path))
    # the path comes from the url: a path containing ".." must not leave the cache
    # folder, its parent contains config.json and the token stored in it
    if cached_element != cache_path and not cached_element.startswith(cache_path + os.sep):
        return render_template('404.html'), 404
    if os.path.isfile(cached_element):
        return send_from_directory(os.path.dirname(cached_element),
                                   os.path.basename(cached_element),
                                   mimetype=get_type_content(path, path, repository_configuration, g_repo, is_website(path, repository_configuration, g_repo)).replace("; charset=utf-8", ""))
    elif os.path.exists(os.path.join(cached_element, "index.html")):
        return send_from_directory(cached_element, "index.html", mimetype='text/html')
    elif os.path.exists(os.path.join(cached_element, "README.md")):
        return send_from_directory(cached_element, "README.md", mimetype='text/html')

    clean_path = path
    if len(clean_path) > 0 and clean_path[-1] == '/':
        clean_path = clean_path[0:-1]

    current_file, files = get_element_from_path(g_repo, g_commit, clean_path)
    if current_file is None:
        return render_template('404.html'), 404
    if type(current_file) == github.ContentFile.ContentFile and current_file.type == 'dir' and len(path) > 0 and path[-1] != '/':
        return redirect(url_for('repository', id=id, path=path + '/'))

    files, current_file = get_current_folder_files(clean_path, current_file, repository_configuration, g_repo, g_commit)
    # the lookup of a default file can return None, its name is read below
    if current_file is None:
        return render_template('404.html'), 404

    content = get_content(current_file, files, clean_path, repository_configuration, g_repo)
    content_type = get_type_content(current_file.name, clean_path, repository_configuration, g_repo, False)
    return content, {'Content-Type': content_type}
@application.route('/', methods=['GET'])
def index():
    """
    display the home page
    the configuration of the repository "id" is given to the page when the
    "githubRepository" argument designates the repository of this configuration
    :return: the index.html page
    """
    id = request.args.get('id', None)
    repo_name = clean_github_repository(request.args.get('githubRepository', None))
    if id is None:
        return render_template('index.html', repo=None)

    config_path = self.config_dir + "/" + id + "/config.json"
    if not os.path.exists(config_path):
        return render_template('index.html', repo=None)

    with open(config_path) as config_file:
        stored_configuration = json.load(config_file, object_hook=json_util.object_hook)
    same_repository = repo_name == clean_github_repository(stored_configuration['repository'])
    return render_template('index.html', repo=stored_configuration if same_repository else None)
@application.route('/robots.txt')
def robots():
    """
    serve the robots.txt file of the static folder
    :return: the response containing the file
    """
    robots_file = application.send_static_file('robots.txt')
    return robots_file
@application.route('/', methods=['POST'])
def add_repository():
    """
    create or update the configuration of an anonymized repository
    the form provides "githubRepository", "terms" and optionally "expiration" and
    "expiration_date", the "id" argument of the url selects the configuration
    :return: a redirection to the anonymized repository, or a 400 error for an invalid id
    """
    id = request.args.get('id', str(uuid.uuid4()))
    # the id comes from the url and is used as a folder name: it must be a single
    # path segment, otherwise the configuration can be written outside config_dir
    if id in ('', '.', '..') or '/' in id or '\\' in id or os.path.basename(id) != id:
        return "Invalid repository id", 400
    config_path = os.path.join(self.config_dir, str(id))
    config_file = os.path.join(config_path, "config.json")

    repo = request.form['githubRepository']
    terms = request.form['terms']
    expiration_date = None
    expiration = None
    if 'expiration' in request.form:
        expiration = request.form['expiration']
    if 'expiration_date' in request.form and request.form['expiration_date'] != '':
        expiration_date = datetime.strptime(request.form['expiration_date'], '%Y-%m-%d')

    user = session.get('user', None)

    token = None
    # NOTE(security): the token of an existing configuration is kept while the rest of
    # the configuration is replaced, and the caller is not checked - to be reviewed
    # the file is tested, not the folder: a folder without config.json is accepted
    if os.path.exists(config_file):
        with open(config_file, 'r') as r:
            data = json.load(r)
            if 'token' in data:
                token = data['token']
    if token is None and user is not None and 'token' in user and user['token'] is not None:
        token = user['token']['access_token']

    if not os.path.exists(config_path):
        os.makedirs(config_path)
    with open(config_file, 'w') as outfile:
        json.dump({
            "id": id,
            "repository": repo,
            "terms": terms.splitlines(),
            "token": token,
            "expiration_date": expiration_date,
            "expiration": expiration
        }, outfile, default=json_util.default)
    return redirect(url_for('repository', id=id))
# the configured application is returned to the caller
return application
def run(self, **keywords):
    """
    start the Flask application
    :param keywords: the options given to Flask.run, "host" (default "127.0.0.1") and
                     "port" (default self.port) can be overridden
    """
    # defaults instead of fixed arguments: giving host or port in keywords raised a
    # TypeError (argument given twice)
    keywords.setdefault("host", "127.0.0.1")
    keywords.setdefault("port", self.port)
    self.application.run(**keywords)
def initParser():
    """
    parse the arguments of the command line
    :return: the parsed arguments (token, secret, client_id, client_secret, host, port, config_dir)
    """
    parser = argparse.ArgumentParser(description='Start Anonymous Github')
    parser.add_argument('-token', required=True, help='GitHub token')
    parser.add_argument('-secret', required=True, help='App secret')
    parser.add_argument('-client_id', required=True, help='GitHub aouth client id')
    parser.add_argument('-client_secret', required=True, help='GitHub aouth client secret')
    parser.add_argument('-host', help='The hostname', default="127.0.0.1")
    # type=int: a port given on the command line was a str while the default is an int
    parser.add_argument('-port', help='The port of the application', default=5000, type=int)
    parser.add_argument('-config_dir', help='The repository that will contains the configuration files',
                        default='./repositories')
    return parser.parse_args()
# start the server with the arguments of the command line
if __name__ == "__main__":
    args = initParser()
    server = Anonymous_Github(
        github_token=args.token,
        host=args.host,
        port=args.port,
        config_dir=args.config_dir,
        secret_key=args.secret,
        client_id=args.client_id,
        client_secret=args.client_secret,
    )
    server.run()