def add_repo(project_id):
    data = request.get_json()
    project = Project.query.get(project_id)
    if project is None:
        return jsonify(message="Project not found"), 404
    if not project.course.is_user(current_user.id):
        return jsonify(message="Unauthorized to add project"), 403
    repo = project.get_repo_for_user(current_user.id)
    if repo:
        return jsonify(message="Already added repo for project"), 400
    repo = Repo(project_id=project.id, name=data['repo_name'])
    repo.user_id = current_user.id
    if project.type is not ProjectTypes.IND:
        g = project.get_group_for_user(current_user.id)
        if g is None:
            return jsonify(message="Not part of group"), 400
        repo.group_id = g.id
    webhook = add_webhook(repo.name, current_user.oauth_token)
    if 'id' not in webhook:
        return jsonify(error="Could not create webhook", data=webhook), 401
    repo.webhook_id = webhook['id']
    db.session.add(repo)
    db.session.commit()
    return jsonify(repo=ps.RepoSchema().dump(repo)), 201

def test_add_topics(self):
    aggregator = Aggregator()
    aggregator.add(Repo(False, 10, "python", ["Flask"]))
    self.assertEqual(aggregator.topics, {"flask": 1})
    aggregator.add(Repo(False, 10, "python", ["flask"]))
    self.assertEqual(aggregator.topics, {"flask": 2})
    aggregator.add(Repo(False, 10, "python", ["flask", "numpy"]))
    self.assertEqual(aggregator.topics, {"flask": 3, "numpy": 1})

def test_add_languages(self):
    aggregator = Aggregator()
    aggregator.add(Repo(False, 10, "Python", ["Flask"]))
    self.assertEqual(aggregator.languages, {"python": 1})
    aggregator.add(Repo(False, 10, "python", ["Flask"]))
    self.assertEqual(aggregator.languages, {"python": 2})
    aggregator.add(Repo(False, 10, "java", ["Flask"]))
    self.assertEqual(aggregator.languages, {"python": 2, "java": 1})
    aggregator.add(Repo(False, 10, None, ["Flask"]))
    self.assertEqual(aggregator.languages, {"python": 2, "java": 1})

def create(self, request, *args, **kwargs):
    """Generates db repo object."""
    user = request.data['user']
    path = request.data['path']
    version = request.data['version']
    res = RepoManager(user).create()
    if not version:
        version = res
    r = Repo(user=user, path=path, version=version)
    r.save()
    rs = RepoSerializer(r)
    return Response(rs.data)

def test_asdict(self):
    aggregator = Aggregator()
    expected = {
        "public_repo_count": 0,
        "public_repo_breakdown": {
            "original_repo_count": 0,
            "forked_repo_count": 0
        },
        "watcher_count": 0,
        "languages": [],
        "topics": []
    }
    self.assertEqual(aggregator.asdict(), expected)
    expected2 = {
        "public_repo_count": 1,
        "public_repo_breakdown": {
            "original_repo_count": 1,
            "forked_repo_count": 0
        },
        "watcher_count": 10,
        "languages": [{"name": "python", "count": 1}],
        "topics": [{"name": "flask", "count": 1}]
    }
    aggregator.add(Repo(True, 10, "python", ["flask"]))
    self.assertEqual(aggregator.asdict(), expected2)

def set_random_day_for_monthy_job():
    for repo in Repo.select().where((Repo.random_job_day == None)):
        repo.random_job_day = random.randint(1, 28)
        task_logger.info(
            f"set random day for monthly job of repo '{repo.name}' at '{repo.random_job_day}'"
        )
        repo.save()

def __init__(self, sample_dict=None, class_names=None):
    global _sample
    if self.feature_name is None:
        raise Exception('Provide feature_name field in subclass.')
    if sample_dict is None:
        if _sample is None:
            _sample = Repo.load_sample(separate=True)
        sample_dict = _sample
    if class_names is None:
        class_names = sorted(classes)
    self.class_names = class_names
    self.figure, self.ax = plt.subplots()
    hist_data = []
    for clsname in class_names:
        # ie 'class feature data'
        cfd = np.array([getattr(repo, self.feature_name)
                        for repo in sample_dict[clsname]])
        if self.in_class_percentiles is not None:
            min_val, max_val = [np.percentile(cfd, i)
                                for i in self.in_class_percentiles]
            cfd = np.array([e for e in cfd if min_val < e < max_val])
        hist_data.append(cfd)
    self.hist_data = hist_data

def get(self, username, reponame):
    try:
        repo = (Repo.select().join(User).alias("user")
                .where((User.name == username) & (Repo.name == reponame))
                .get())
        title = repo.user.name + "/" + repo.name
        timemap = self.get_query_argument("timemap", "false") == "true"
        datetime = self.get_query_argument("datetime", None)
        key = self.get_query_argument("key", None)
        if key and not timemap:
            self.render("repo/memento.html", repo=repo, key=key,
                        datetime=datetime)
        elif key and timemap:
            self.render("repo/history.html", repo=repo, key=key)
        else:
            cs = (CSet.select(fn.distinct(CSet.hkey))
                  .where(CSet.repo == repo).limit(5).alias("cs"))
            samples = (HMap.select(HMap.val)
                       .join(cs, on=(HMap.sha == cs.c.hkey_id)))
            self.render("repo/show.html", title=title, repo=repo,
                        samples=list(samples))
    except Repo.DoesNotExist:
        raise HTTPError(404)

def get(self, username, reponame):
    try:
        repo = (Repo.select().join(User).alias("user")
                .where((User.name == username) & (Repo.name == reponame))
                .get())
        title = repo.user.name + "/" + repo.name
        timemap = self.get_query_argument("timemap", "false") == "true"
        datetime = self.get_query_argument("datetime", None)
        key = self.get_query_argument("key", None)
        index = self.get_query_argument("index", "false") == "true"

        if self.get_query_argument("datetime", None):
            datestr = self.get_query_argument("datetime")
            try:
                ts = date(datestr, QSDATEFMT)
            except ValueError:
                raise HTTPError(reason="Invalid format of datetime param",
                                status_code=400)
        elif "Accept-Datetime" in self.request.headers:
            datestr = self.request.headers.get("Accept-Datetime")
            ts = date(datestr, RFC1123DATEFMT)
        else:
            ts = now()

        if key and not timemap:
            chain = revision_logic.get_chain_at_ts(repo, key, ts)
            # use ts of cset instead of now(), to make prev work
            if len(chain) != 0:
                ts = chain[-1].time
            cs_prev = revision_logic.get_cset_prev_before_ts(repo, key, ts)
            cs_next = revision_logic.get_cset_next_after_ts(repo, key, ts)
            if cs_prev:
                cs_prev_str = (self.request.protocol + "://" +
                               self.request.host + self.request.path +
                               "?key=" + key + "&datetime=" +
                               cs_prev.time.strftime(QSDATEFMT))
            else:
                cs_prev_str = ""
            if cs_next:
                cs_next_str = (self.request.protocol + "://" +
                               self.request.host + self.request.path +
                               "?key=" + key + "&datetime=" +
                               cs_next.time.strftime(QSDATEFMT))
            else:
                cs_next_str = ""
            commit_message = revision_logic.get_commit_message(repo, key, ts)
            self.render("repo/memento.html", repo=repo, key=key,
                        datetime=datetime, cs_next_str=cs_next_str,
                        cs_prev_str=cs_prev_str,
                        commit_message=commit_message)
        elif key and timemap:
            self.render("repo/history.html", repo=repo, key=key)
        elif index:
            cs = (CSet.select(fn.distinct(CSet.hkey))
                  .where((CSet.repo == repo) & (CSet.time <= ts)).alias("cs"))
            key_count = (HMap.select(HMap.val)
                         .join(cs, on=(HMap.sha == cs.c.hkey_id))).count()
            page = int(self.get_query_argument("page", "1"))
            hm = revision_logic.get_repo_index(repo, ts, page)
            self.render("repo/index.html", repo=repo, title=title,
                        key_count=key_count,
                        page_size=revision_logic.INDEX_PAGE_SIZE,
                        hm=hm, current_page=page)
        else:
            hm = list(revision_logic.get_repo_index(repo, ts, 1, 5))
            # cs = (CSet.select(fn.distinct(CSet.hkey)).where(CSet.repo == repo).limit(5).alias("cs"))
            # samples = (HMap.select(HMap.val).join(cs, on=(HMap.sha == cs.c.hkey_id)))
            self.render("repo/show.html", title=title, repo=repo, hm=hm)
    except Repo.DoesNotExist:
        raise HTTPError(reason="Repo not found.", status_code=404)

async def html_app(request, app_name):
    app = Repo.select().where(Repo.name == app_name)
    if app.count() == 0:
        raise NotFound()
    return {"app": app[0], 'relative_path_to_root': '../../', 'path': request.path}

async def launch_monthly_job():
    today = date.today().day
    for repo in Repo.select().where(Repo.random_job_day == today):
        task_logger.info(
            f"Launch monthly job for {repo.name} on day {today} of the month"
        )
        await create_job(repo.name, repo.url)

def test_repo_commits(self):
    repo = Repo.create(**self.TEST_REPO)
    repo.save()
    response = self.fetch(self.get_app().reverse_url('view', repo.id))
    self.assertIn(self.MESSAGES['no_records'], response.body.decode())
    self.assertIn(self.MESSAGES['get_more'], response.body.decode())
    for commit in range(self.TEST_COUNT):
        commit_data = self.TEST_COMMIT
        commit_data.update({'repo': repo})
        c = Commit(**commit_data)
        c.save()
    response = self.fetch(self.get_app().reverse_url('view', repo.id))
    self.assertEqual(response.body.decode().count(self.TEST_COMMIT['message']),
                     self.TEST_COUNT)
    self.assertIn(self.MESSAGES['get_more'], response.body.decode())
    repo.next_page = None
    repo.save()
    response = self.fetch(self.get_app().reverse_url('view', repo.id))
    self.assertNotIn(self.MESSAGES['get_more'], response.body.decode())

def reconcile_db_with_gh(*args, **kwargs):
    ghc = GitHubConnector()
    issues = ghc.get_all_issues()
    repos = ghc.get_all_repos()
    for repo in repos:
        r = Repo(github_id=repo.id, name=repo.name)
        r.save()
    for issue in issues:
        i = Issue(github_id=issue.id)
        i.title = issue.title
        i.number = issue.number
        i.repo = Repo.objects.get(name=issue.repository[1])
        i.save()
    print "Not only did your task run successfully, but you're damned good looking too."

def add_repository(url, comment, user_id):
    repo = Repo(url=url, timestamp=datetime.datetime.now(),
                user_id=user_id, comment=comment)
    db.session.add(repo)
    db.session.commit()
    return repo

def get(self, username, reponame):
    try:
        repo = (Repo.select().join(User).alias("user")
                .where((User.name == username) & (Repo.name == reponame))
                .get())
        if not repo.private:
            self._get(repo)
        else:
            self._getAuth(repo)
    except Repo.DoesNotExist:
        raise HTTPError(reason="Repo not found.", status_code=404)

def post(self):
    reponame = self.get_argument("reponame", None)
    desc = self.get_argument("description", None)
    user = self.current_user
    if not reponame:
        self.redirect(self.reverse_url("web:create-repo"))
        return
    repo = Repo.create(user=user, name=reponame, desc=desc)
    self.redirect(self.reverse_url("web:repo", user.name, repo.name))

def test_init(self):
    is_original = True
    watcher_count = 10
    language = "Python"
    topics = ["Flask"]
    repo = Repo(is_original, watcher_count, language, topics)
    self.assertEqual(repo.is_original, is_original)
    self.assertEqual(repo.watcher_count, watcher_count)
    self.assertEqual(repo.language, language)
    self.assertEqual(repo.topics, topics)

def __init__(self, username, repo_name):
    self.repo = Repo(repo_path(username, repo_name))
    self.seen_files = {}
    try:
        print "Repo exists"
        self.repo_model = RepoModel.objects.get(username=username, name=repo_name)
    except RepoModel.DoesNotExist:
        self.repo_model = RepoModel(username=username, name=repo_name)
        self.repo_model.save()
    self.cached_data = {}

def get(self):
    query = tornado.escape.url_unescape(self.get_argument("q", ""))
    if query:
        pattern = "%" + query + "%"
        repos = (Repo.select().join(User).alias("user")
                 .where(Repo.name ** pattern, Repo.private == False))
        users = User.select().where(User.name ** pattern)
    else:
        repos = []
        users = []
    self.render("search/show.html", query=query, repos=repos, users=users)

def change_status(target_repo, state):
    from models import Repo
    # class Repo(Document):
    #     url = StringField(max_length=100, default='Not set yet')
    #     last_used = DateTimeField(default=datetime.now())
    #     created_on = DateTimeField(default=datetime.now())
    #     monitoring = StringField(max_length=100, default='Not set yet')
    #     state = StringField(max_length=50, default='Ready')
    #     owner = StringField(max_length=50, default='no')
    if not use_database:
        return ''
    try:
        repo = Repo.objects.get(url=target_repo)
        repo.last_used = datetime.today()
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except DoesNotExist:
        repo = Repo()
        repo.url = target_repo
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except Exception as e:
        print 'database_exception: ' + str(e)

def get(self):
    query = tornado.escape.url_unescape(self.get_argument("q", ""))
    if query:
        pattern = "%" + query + "%"
        repos = (Repo.select().join(User).alias("user")
                 .where(Repo.name ** pattern))
        users = User.select().where(User.name ** pattern)
    else:
        repos = []
        users = []
    self.render("search/show.html", query=query, repos=repos, users=users)

async def launch_monthly_job(type):
    # XXX DRY
    job_command_last_part = ""
    if type == "arm":
        job_command_last_part = " (~ARM~)"
    elif type == "testing-unstable":
        job_command_last_part = [" (testing)", " (unstable)"]

    today = date.today().day
    for repo in Repo.select().where(Repo.random_job_day == today):
        task_logger.info(
            f"Launch monthly job for {repo.name} on day {today} of the month"
        )
        await create_job(repo.name, repo.app_list, repo, job_command_last_part)

async def ws_app(request, websocket, app_name):
    # XXX I don't check if the app exists because this websocket is supposed to
    # be only loaded from the app page which does this job already
    app = Repo.select().where(Repo.name == app_name)[0]

    subscribe(websocket, f"app-jobs-{app.url}")

    await websocket.send(ujson.dumps({
        "action": "init_jobs",
        "data": Job.select().where(Job.url_or_path == app.url).order_by(-Job.id),
    }))

    await websocket.wait_closed()

async def html_job(request, job_id):
    job = Job.select().where(Job.id == job_id)
    if job.count() == 0:
        raise NotFound()
    job = job[0]
    app = Repo.select().where(Repo.url == job.url_or_path)
    app = app[0] if app else None
    return {
        "job": job,
        'app': app,
        'relative_path_to_root': '../',
        'path': request.path,
    }

def delete(self, username, reponame):
    # Check whether the key exists and if maybe the last change already is
    # a delete, else insert a `CSet.DELETE` entry without any blob data.
    key = self.get_query_argument("key")

    if username != self.current_user.name:
        raise HTTPError(403)
    if not key:
        raise HTTPError(400)

    datestr = self.get_query_argument("datetime", None)
    ts = datestr and date(datestr, QSDATEFMT) or now()

    try:
        repo = (Repo.select(Repo.id).join(User)
                .where((User.name == username) & (Repo.name == reponame))
                .naive().get())
    except Repo.DoesNotExist:
        raise HTTPError(404)

    sha = shasum(key.encode("utf-8"))

    try:
        last = (CSet.select(CSet.time, CSet.type)
                .where((CSet.repo == repo) & (CSet.hkey == sha))
                .order_by(CSet.time.desc())
                .limit(1).naive().get())
    except CSet.DoesNotExist:
        # No changeset was found for the given key -
        # the resource does not exist.
        raise HTTPError(400)

    if not ts > last.time:
        # Appended timestamps must be monotonically increasing!
        raise HTTPError(400)

    if last.type == CSet.DELETE:
        # The resource was deleted already, return instantly.
        return self.finish()

    # Insert the new "delete" change.
    CSet.create(repo=repo, hkey=sha, time=ts, type=CSet.DELETE, len=0)

def test_commits_create(self, get_commits):
    fixtures = self.data.buffer.read()
    jsn = json.loads(fixtures.decode())
    request = HTTPRequest(self.TEST_REPO['href'])
    response = HTTPResponse(request, client.OK, buffer=io.BytesIO(fixtures))
    future = Future()
    future.set_result(response)
    get_commits.return_value = future
    body = {'href': self.TEST_REPO['href']}
    response = self.fetch(self.get_app().reverse_url('create'),
                          method='POST', body=urlencode(body).encode())
    # "всего" is Russian for "total"; the rendered page reports the commit count
    self.assertIn('всего {}'.format(len(jsn)), response.body.decode())
    self.assertEqual(len(jsn), Commit.select().count())
    self.assertEqual(1, Repo.select().count())

def change_status(target_repo, state):
    from models import Repo
    if not use_database:
        return ''
    try:
        repo = Repo.objects.get(url=target_repo)
        repo.last_used = datetime.today()
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except DoesNotExist:
        repo = Repo()
        repo.url = target_repo
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except Exception as e:
        print 'database_exception: ' + str(e)

def get_samples():
    """Return a {'class': [reponames]}."""
    repos = Repo.load()
    fetch_dates = [datetime.datetime(*(r.fetch_ymd)) for r in repos]

    print 'number of repos:', len(repos)
    latest_fetch = max(fetch_dates)
    print 'fetched between %s and %s' % (min(fetch_dates), latest_fetch)
    print

    filtered = [r for r in repos if
                30720 > r.size > 0 and  # not foolproof to avoid big repos
                r.stars > 1 and
                not r.fork and
                not 'dotfile' in r.name.lower() and
                not 'sublime' in r.name.lower()  # avoid SublimeText config
                ]
    print 'after noise filter:', len(filtered)

    filtered = [r for r in filtered if
                ((latest_fetch - r.creation_date) > datetime.timedelta(30))
                ]
    print 'excluding very new:', len(filtered)

    filtered = [r for r in filtered if
                r.stars > 5 and
                classes.score(r) > (1 / 30)
                ]
    print 'excluding very unpopular:', len(filtered)

    score_pairs = [(classes.score(r), r) for r in filtered]
    score_pairs.sort(key=lambda x: x[0])

    # top 1k, bottom 1k.
    return {'high': [r.name for (score, r) in score_pairs[-1000:]],
            'low': [r.name for (score, r) in score_pairs[:1000]],
            }

def get(self, username):
    try:
        user = User.select().where(User.name == username).get()
    except User.DoesNotExist:
        raise HTTPError(reason="User not found.", status_code=404)

    repos = Repo.select().where(Repo.user == user)
    reposit = repos.iterator()
    # TODO: Paginate?

    first = None
    try:
        first = reposit.next()
    except StopIteration:
        # No repos for user
        # No need to raise an error, just return empty list in json
        pass

    accept = self.request.headers.get("Accept", "")
    user_url = (self.request.protocol + "://" + self.request.host)

    if "application/json" in accept or "*/*" in accept:
        self.set_header("Content-Type", "application/json")
        self.write('{"username": ' + json_encode(username) +
                   ', "repositories": {"list":[')
        m = ('{{"name": "{0}", "uri": "' + user_url + '/' + username + '/{0}"}}')
        if first:
            self.write(m.format(first.name))
        for repo in reposit:
            self.write(', ' + m.format(repo.name))
        self.write(']}')
        self.write('}')

def cargar_datos(self):
    """Parse the contents of a JSON file."""
    datos_json = self.cargar_json()
    for k, v in datos_json.items():
        if k == "usuario":
            usuario = v
            self.usuario = UsuarioGit(usuario["usuario"], usuario["email"])
        elif k == "repos":
            for item in v:
                aux = item
                repo = Repo(
                    aux["nome"],
                    aux["rama"],
                    aux["remoto"],
                    aux["uri"],
                    aux["directorio"],
                    aux["ten_submodulos"],
                    aux["e_submodulo"]
                )
                self.add_repo(repo)

def test_add_none(self):
    aggregator = Aggregator()
    aggregator.add(Repo(None, 10, "Python", ["Flask"]))
    self.assertEqual(aggregator.count, 1)
    self.assertEqual(aggregator.original_count, 0)
    self.assertEqual(aggregator.forked_count, 0)

def RunWithPars(pars, uid):
    startTime = Common.getCurrentTimeMil()
    reposFol = 'SavePath/Repos/'
    if not os.path.exists(config.basedir + '/' + reposFol):
        os.makedirs(config.basedir + '/' + reposFol)
    fname = reposFol + Common.gen_rnd_filename()
    os.makedirs(config.basedir + '/' + fname)
    ## ADD Field
    e = Repo()
    e.cloneFinishDate = "--RUNNING--"
    e.cloneStartDate = str(startTime)
    e.repoInfo = ''
    e.isPrivate = int(pars['isPrivate'])
    e.path = fname
    e.repoName = pars['repoName']
    e.url = pars['url']
    e.userId = uid
    db.session.add(e)
    db.session.commit()
    try:
        porcelain.clone(pars['url'], config.basedir + '/' + fname)
        endTime = Common.getCurrentTimeMil()
        e.cloneFinishDate = str(endTime)
        db.session.commit()
    except Exception as ex:
        print(ex)
        e.delete()
        db.session.commit()

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer

import classes
from models import Repo
import utils


def get_classifier(X, y):
    return RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


if __name__ == '__main__':
    repos = Repo.load_sample()
    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    dict_repos = []
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}
        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    vectorizer = DictVectorizer(sparse=False)
    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)

def load_data(args):
    start_time = time.time()
    users = {}
    repos = {}
    popular_repos = []
    superprojects = {}

    print "Loading user to repo map"
    data = open(args[0], 'r')
    for line in data.readlines():
        user_id, repo_id = line.strip().split(':')
        user_id = int(user_id)
        repo_id = int(repo_id)
        if user_id in users:
            user = users[user_id]
        else:
            user = User(user_id)
            users[user_id] = user
        if repo_id in repos:
            repo = repos[repo_id]
        else:
            repo = Repo(repo_id)
            repos[repo_id] = repo
        repo.is_watched_by(user)
        user.is_watching(repo)
    data.close()

    print "Ordering repos by popularity"
    popular_repos = sorted(repos.values(), reverse=True, key=lambda x: x.popularity)

    owners = {}

    print "Reading repo details"
    repo_txt = open(args[1], 'r')
    for line in repo_txt.readlines():
        id, other = line.strip().split(':')
        id = int(id)
        if id not in repos:
            continue
        parts = other.split(',')
        repo = repos[id]
        owner, repo.name = parts[0].split('/', 1)
        repo.creation_date = parts[1]
        if owner in owners:
            repo.owner = owners[owner]
        else:
            repo.owner = RepoOwner(owner)
            owners[owner] = repo.owner
        repo.owner.owns.add(repo)
        if len(parts) > 2 and int(parts[2]) in repos:
            repo.forked_from = repos[int(parts[2])]
            repo.forked_from.forked_by.append(repo)
    repo_txt.close()

    # print "Grouping superprojects"
    # superproject_keys = ['gnome', 'django', 'ruby', 'perl', 'rails']
    # for repo in repos.values():
    #     for key in superproject_keys:
    #         if key in repo.name.lower():
    #             if key not in superprojects:
    #                 superprojects[key] = []
    #             superprojects[key].append(repo)

    print "Reading repo language"
    lang = open(args[2], 'r')
    for line in lang.readlines():
        id, other = line.strip().split(':')
        id = int(id)
        if id not in repos:
            continue
        parts = other.split(',')
        repo = repos[id]
        for part in parts:
            lang_name, count = part.split(';')
            repo.langs.append((lang_name, int(count)))
    lang.close()

    print "Data read in %d seconds" % (time.time() - start_time)
    return users, repos, popular_repos, superprojects

for class_name in class_names:
    repos = class_map[class_name]
    feature_data = [getattr(repo, feature_name) for repo in repos]
    class_summaries.append([f(feature_data) for f in funcs])
feature_summaries[feature_name] = np.array(class_summaries)

for feature_name, summary in feature_summaries.items():
    print feature_name
    for i, class_name in enumerate(class_names):
        print ' ', class_name
        print ' ', '\t'.join(str(e) for e in summary[i])
    print '-----'
    print


if __name__ == '__main__':
    class_map = Repo.load_sample(separate=True)

    summarize_features(class_map,
                       ['with_stmt_usage', 'compr_usage', 'lambda_usage',
                        'global_usage', 'gen_exp_usage', 'print_usage',
                        ],
                       sorted(classes))

    #summarize_imports(class_map)

def workflow(changed_files, repo_str):
    global current_repo, current_user
    current_repo = Repo(name=repo_str, user=current_user, status="starting",
                        started_at=datetime.now(), progress=10)
    current_repo.save()
    dolog("deleting the repo if exists")
    change_status("preparation", 10)
    delete_forked_repo(repo_str)
    time.sleep(60)  # for GitHub to avoid the unknown issues
    dolog("forking the repo %s" % repo_str)
    change_status("forking", 20)
    repo_url = fork_repo(repo_str)
    dolog("cloning the repo %s" % repo_url)
    change_status("cloning", 30)
    clone_repo(repo_url)
    # fork_cleanup()
    change_status("updating the fork", 40)
    # update from upstream as the cloned repo is an old fork due to Github limitation
    update_fork(repo_str)
    dolog("getting jar configurations")
    change_status("getting configurations", 50)
    target_files, jar_command = get_jar_config(os.path.join(get_repo_abs_path(), 'jar.cfg'))
    if target_files is None or jar_command is None:
        dolog("get jar config failed")
        change_status("Error getting configurations", 100)
        delete_local_copy()
        return "get jar config failed"
    else:
        change_status("configuration parsed", 60)
    dolog("running if target")
    change_status("running if target", 70)
    is_found, msg = run_if_target(changed_files, target_files, jar_command)
    dolog("after running")
    if is_found:
        dolog("is found")
        change_status("pushing changes", 80)
        push_changes()
        dolog("after pushing the changes")
        change_status("creating pull request", 90)
        if create_pull_request(repo_str):
            dolog("pull request is True")
            change_status("pull request created", 100)
            current_repo.completed_at = datetime.now()
            current_repo.save()
            msg += " And pull request is created"
            dolog("deleting the forked repo attempt")
        else:
            dolog("pull request is False")
            change_status("pull failed to be created", 100)
            current_repo.completed_at = datetime.now()
            current_repo.save()
            msg += " And pull request failed to be created"
    else:
        change_status("not found", 100)
        dolog("not found")
        current_repo.completed_at = datetime.now()
        current_repo.save()
    return msg

async def api_list_app(request):
    query = Repo.select()
    return response.json([model_to_dict(x) for x in query.order_by(Repo.name)])

async def ws_apps(request, websocket):
    subscribe(websocket, "jobs")
    subscribe(websocket, "apps")

    # I need to do this because peewee strangely f**k up on join and remove the
    # subquery fields which breaks everything
    repos = Repo.raw('''
        SELECT
            "id", "name", "url", "revision", "state", "random_job_day",
            "job_id", "job_name", "job_state",
            "created_time", "started_time", "end_time"
        FROM "repo" AS "t1"
        INNER JOIN (
            SELECT
                "t1"."id" as "job_id",
                "t1"."name" as "job_name",
                "t1"."url_or_path",
                "t1"."state" as "job_state",
                "t1"."created_time",
                "t1"."started_time",
                "t1"."end_time"
            FROM "job" AS "t1"
            INNER JOIN (
                SELECT Max("t2"."id") AS "max_id"
                FROM "job" AS "t2"
                GROUP BY "t2"."url_or_path"
            ) AS "t3"
            ON ("t1"."id" = "t3"."max_id")
        ) AS "t5"
        ON ("t5"."url_or_path" = "t1"."url")
        ORDER BY "name"
    ''')

    repos = [
        {
            "id": x.id,
            "name": x.name,
            "url": x.url,
            "revision": x.revision,
            "state": x.state,
            "random_job_day": x.random_job_day,
            "job_id": x.job_id,
            "job_name": x.job_name,
            "job_state": x.job_state,
            "created_time": datetime.strptime(x.created_time.split(".")[0], '%Y-%m-%d %H:%M:%S') if x.created_time else None,
            "started_time": datetime.strptime(x.started_time.split(".")[0], '%Y-%m-%d %H:%M:%S') if x.started_time else None,
            "end_time": datetime.strptime(x.end_time.split(".")[0], '%Y-%m-%d %H:%M:%S') if x.end_time else None,
        }
        for x in repos
    ]

    # add apps without jobs
    selected_repos = {x["id"] for x in repos}
    for repo in Repo.select().where(Repo.id.not_in(selected_repos)):
        repos.append({
            "id": repo.id,
            "name": repo.name,
            "url": repo.url,
            "revision": repo.revision,
            "state": repo.state,
            "random_job_day": repo.random_job_day,
            "job_id": None,
            "job_name": None,
            "job_state": None,
            "created_time": None,
            "started_time": None,
            "end_time": None,
        })

    repos = sorted(repos, key=lambda x: x["name"])

    await websocket.send(ujson.dumps({
        "action": "init_apps",
        "data": repos,
    }))

    await websocket.wait_closed()

def calculate(f_to_calc, f_to_overwrite, console, download):
    """Calculate a list of features."""
    sys.stdout.write('loading')
    sys.stdout.flush()
    repos = Repo.load_sample()

    seen = 0
    total = len(repos)
    dl_failures = []
    calc_failures = []
    last_write = datetime.datetime.now()

    if f_to_calc or f_to_overwrite or download:
        for repo in repos:
            seen += 1

            success = True
            if download:
                success = utils.clone(repo)
            if not success:
                dl_failures.append(repo)
                continue

            try:
                if f_to_calc:
                    logging.info("calc: %s", repo)
                    repo.calculate_features(f_to_calc)
                if f_to_overwrite:
                    logging.info("calc: %s", repo)
                    repo.calculate_features(f_to_overwrite, overwrite=True)
                repo._clear_support_features()  # we're done with this repo now
            except:
                print  # from status line
                logging.exception("!problem: %s", repo)
                calc_failures.append(repo)
                print

            progress_bar(seen, total)

            since_write = datetime.datetime.now() - last_write
            if since_write > datetime.timedelta(minutes=5):
                sys.stdout.write("\r(writing results)")
                sys.stdout.flush()
                Repo.write_update(repos)
                last_write = datetime.datetime.now()

    print  # from progress bar line

    if dl_failures:
        print "%s failed to download:" % len(dl_failures)
        for f in dl_failures:
            print " %s" % f
        print

    if calc_failures:
        print "%s failed during calc:" % len(calc_failures)
        for f in calc_failures:
            print " %s" % f
        print

    if console:
        message = ('`repos` contains results;\n'
                   'use ^d to write out or `exit()` to cancel')
        code.interact(message, local=locals())

    print 'writing out...'
    Repo.write_update(repos)

async def monitor_apps_lists(monitor_git=False,
                             monitor_only_good_quality_apps=False):
    "parse apps lists every hour or so to detect new apps"

    # only support github for now :(
    async def get_master_commit_sha(url):
        command = await asyncio.create_subprocess_shell(
            f"git ls-remote {url} master",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE)
        data = await command.stdout.read()
        commit_sha = data.decode().strip().replace("\t", " ").split(" ")[0]
        return commit_sha

    async with aiohttp.ClientSession() as session:
        task_logger.info(f"Downloading applist...")
        async with session.get(APPS_LIST) as resp:
            data = await resp.json()
            data = data["apps"]

    repos = {x.name: x for x in Repo.select()}

    for app_id, app_data in data.items():
        commit_sha = await get_master_commit_sha(app_data["git"]["url"])

        if app_data["state"] != "working":
            task_logger.debug(f"skip {app_id} because state is {app_data['state']}")
            continue

        if monitor_only_good_quality_apps:
            if app_data.get("level") in [None, "?"] or app_data["level"] <= 4:
                task_logger.debug(f"skip {app_id} because app is not good quality")
                continue

        # already know, look to see if there is new commits
        if app_id in repos:
            repo = repos[app_id]

            # but first check if the URL has changed
            if repo.url != app_data["git"]["url"]:
                task_logger.info(
                    f"Application {app_id} has changed of url from {repo.url} to {app_data['git']['url']}"
                )

                repo.url = app_data["git"]["url"]
                repo.save()

                await broadcast({
                    "action": "update_app",
                    "data": model_to_dict(repo),
                }, "apps")

                # change the url of all jobs that used to have this URL I
                # guess :/
                # this isn't perfect because that could overwrite added by
                # hand jobs but well...
                for job in Job.select().where(Job.url_or_path == repo.url,
                                              Job.state == "scheduled"):
                    job.url_or_path = repo.url
                    job.save()

                    task_logger.info(
                        f"Updating job {job.name} #{job.id} for {app_id} to {repo.url} since the app has changed of url"
                    )

                    await broadcast({
                        "action": "update_job",
                        "data": model_to_dict(job),
                    }, ["jobs", f"job-{job.id}", f"app-jobs-{job.url_or_path}"])

            # we don't want to do anything else
            if not monitor_git:
                continue

            repo_is_updated = False
            if repo.revision != commit_sha:
                task_logger.info(
                    f"Application {app_id} has new commits on github "
                    f"({repo.revision} → {commit_sha}), schedule new job")
                repo.revision = commit_sha
                repo.save()
                repo_is_updated = True

                await create_job(app_id, repo.url)

            repo_state = ("working" if app_data["state"] == "working"
                          else "other_than_working")
            if repo.state != repo_state:
                repo.state = repo_state
                repo.save()
                repo_is_updated = True

            if repo.random_job_day is None:
                repo.random_job_day = random.randint(1, 28)
                repo.save()
                repo_is_updated = True

            if repo_is_updated:
                await broadcast({
                    "action": "update_app",
                    "data": model_to_dict(repo),
                }, "apps")

        # new app
        elif app_id not in repos:
            task_logger.info(f"New application detected: {app_id} " +
                             (", scheduling a new job" if monitor_git else ""))
            repo = Repo.create(
                name=app_id,
                url=app_data["git"]["url"],
                revision=commit_sha,
                state="working" if app_data["state"] == "working" else "other_than_working",
                random_job_day=random.randint(1, 28),
            )

            await broadcast({
                "action": "new_app",
                "data": model_to_dict(repo),
            }, "apps")

            if monitor_git:
                await create_job(app_id, repo.url)

        await asyncio.sleep(1)

    # delete apps removed from the list
    unseen_repos = set(repos.keys()) - set(data.keys())

    for repo_name in unseen_repos:
        repo = repos[repo_name]

        # delete scheduled jobs first
        task_logger.info(
            f"Application {repo_name} has been removed from the app list, "
            "start by removing its scheduled job if there are any..."
        )
        for job in Job.select().where(Job.url_or_path == repo.url,
                                      Job.state == "scheduled"):
            await api_stop_job(None, job.id)  # not sure this is going to work
            job_id = job.id

            task_logger.info(
                f"Delete scheduled job {job.name} #{job.id} for application {repo_name} because the application is being deleted."
            )
            data = model_to_dict(job)
            job.delete_instance()

            await broadcast({
                "action": "delete_job",
                "data": data,
            }, ["jobs", f"job-{job_id}", f"app-jobs-{job.url_or_path}"])

        task_logger.info(
            f"Delete application {repo_name} because it has been removed from the apps list."
        )
        data = model_to_dict(repo)
        repo.delete_instance()

        await broadcast({
            "action": "delete_app",
            "data": data,
        }, "apps")

def create_new_repo(url, branch):
    repo = Repo(url=url, branch=branch)
    repo.save()
    return repo

    #benchmark(PassiveAggressiveClassifier(n_iter=50), X, y, feature_names)

    #print 'kNN'
    #benchmark(KNeighborsClassifier(n_neighbors=10), X, y, feature_names)

    #print 'SGD'
    #benchmark(SGDClassifier(n_jobs=-1, alpha=.0001, n_iter=np.ceil(10**3), penalty="elasticnet", shuffle=True),
    #          X, y, feature_names)

    #print 'nearest centroid'
    #benchmark(NearestCentroid(), X, y, feature_names)

    #print 'naive bayes'
    #benchmark(MultinomialNB(alpha=.01), X, y, feature_names)

    #print 'naive bayes (bernoulli)'
    #benchmark(BernoulliNB(alpha=.01), X, y, feature_names)

    #classify(X, y, id_to_class, vec)
    # classify(select_by_pca(X, y), y, id_to_class, vec)


if __name__ == '__main__':
    ignore = ['imported_modules']
    #ignore += ['imported_stdlib_modules']
    features = [f for f in all_features if f not in ignore]
    #features = ['imported_stdlib_modules']

    _run(Repo.load_sample(), features)

class Analyzer(object):
    def __init__(self, username, repo_name):
        self.repo = Repo(repo_path(username, repo_name))
        self.seen_files = {}
        try:
            print "Repo exists"
            self.repo_model = RepoModel.objects.get(username=username, name=repo_name)
        except RepoModel.DoesNotExist:
            self.repo_model = RepoModel(username=username, name=repo_name)
            self.repo_model.save()
        self.cached_data = {}

    def is_python_file(self, diff):
        if diff.a_blob and re.search('.*\.py$', diff.a_blob.name):
            return True
        return False

    def get_classes(self, entities):
        return [e for e in entities if type(e) == ast.ClassDef]

    def get_functions(self, entities):
        return [e for e in entities if type(e) == ast.FunctionDef]

    def get_all_funcs_from_body(self, body):
        funcs = self.get_functions(body)
        classes = self.get_classes(body)
        for c in classes:
            funcs = funcs + self.get_all_funcs_from_body(c.body)
        for f in funcs:
            funcs = funcs + self.get_all_funcs_from_body(f.body)
        return funcs

    def read_diffs(self, diffs):
        new_funcs = []
        files_changed = []
        for diff in diffs:
            if diff.a_blob and diff.b_blob:
                a_blob_text = diff.a_blob.data_stream.read()
                b_blob_text = diff.b_blob.data_stream.read()
                try:
                    a_syntax_tree = ast.parse(a_blob_text)
                except (ValueError, SyntaxError, TypeError) as e:
                    #Someone has committed some crap that's not valid python,
                    #carry on...
                    continue
                a_entities = a_syntax_tree.body
                a_funcs = self.get_all_funcs_from_body(a_entities)
                a_func_names = [f.name for f in a_funcs]
                file_name = diff.a_blob.abspath + diff.a_blob.name
                files_changed.append(file_name)
                if not self.seen_files.get(file_name, False):
                    #This is a new file, so ALL functions contained within it are new
                    self.seen_files[file_name] = True
                    new_funcs = new_funcs + [(diff.a_blob.abspath, fname)
                                             for fname in a_func_names]
                    print "New file!"
                    print new_funcs
                else:
                    #Not a new file, get what has changed, so get the next blob,
                    #parse it, and get the functions from it.
                    #Get the syntax_tree for the second blob
                    try:
                        b_syntax_tree = ast.parse(b_blob_text)
                    except (ValueError, SyntaxError, TypeError) as e:
                        #Someone has committed some crap that's not valid python,
                        #carry on...
                        continue
                    b_entities = b_syntax_tree.body
                    b_funcs = self.get_all_funcs_from_body(b_entities)
                    b_func_names = [f.name for f in b_funcs]
                    #xor the functions
                    new_in_this_diff = list(set(a_func_names) ^ set(b_func_names))
                    new_funcs = new_funcs + [(diff.a_blob.abspath, fname)
                                             for fname in new_in_this_diff]
        return new_funcs, files_changed

    def store(self, commit, new_funcs, files_changed):
        name = commit.author.name
        date = commit.committed_date
        self.cached_data[commit.hexsha] = {
            'name': name,
            'date': date,
            'funcs': new_funcs,
            'files_changed': files_changed
        }
        if len(self.cached_data.keys()) > 30:
            with transaction.commit_on_success():
                self.do_save()

    def do_save(self):
        for hexsha in self.cached_data:
            val = self.cached_data[hexsha]
            try:
                actor = Actor.objects.get(full_name=val['name'])
            except Actor.DoesNotExist:
                actor = Actor(full_name=val['name'])
                actor.save()  #Create the actor
            try:
                commit = Commit.objects.get(hexsha=hexsha)
            except Commit.DoesNotExist:
                commit = Commit(hexsha=hexsha, repo=self.repo_model, actor=actor)
                commit.save()
            for path, fun in val['funcs']:
                if not Function.objects.filter(name=fun, path=path).exists():
                    fmodel = Function(name=fun, commit=commit, path=path)
                    fmodel.save()
                    print "Saved `%s` : `%s`" % (path[-16:], fun)
            for file_name in val['files_changed']:
                FileChange(path=file_name, actor=actor, commit=commit).save()
        self.cached_data.clear()

    def walk_commits(self):
        #This uses a lot of memory, but...I don't see another way to go backwards
        #in git python
        commits = []
        for c in self.repo.iter_commits():
            commits.append(c)
        #pop the first commit off, so that all commits in the loop will have a parent
        commits.pop()
        while len(commits) > 0:
            commit = commits.pop()
            #Create a list of diffs based on the parent (aka commit before this commit)
            try:
                diffs = commit.diff(commit.parents[0])
                diffs = [d for d in diffs if self.is_python_file(d)]
                new_funcs, files_changed = self.read_diffs(diffs)
                self.store(commit, new_funcs, files_changed)
            except LookupError:
                #This seems to be a bug in PyGit maybe?
                #seems to throw this sometimes, not much we can do here...
                continue

def get_repo_count():
    return Repo.select().count()

def get(self, username, reponame):
    timemap = self.get_query_argument("timemap", "false") == "true"
    index = self.get_query_argument("index", "false") == "true"
    key = self.get_query_argument("key", None)

    if (index and timemap) or (index and key) or (timemap and not key):
        raise HTTPError(400)

    if self.get_query_argument("datetime", None):
        datestr = self.get_query_argument("datetime")
        ts = date(datestr, QSDATEFMT)
    elif "Accept-Datetime" in self.request.headers:
        datestr = self.request.headers.get("Accept-Datetime")
        ts = date(datestr, RFC1123DATEFMT)
    else:
        ts = now()

    try:
        repo = (Repo.select(Repo.id).join(User)
                .where((User.name == username) & (Repo.name == reponame))
                .naive().get())
    except Repo.DoesNotExist:
        raise HTTPError(404)

    if key and not timemap:
        # Recreate the resource for the given key in its latest state -
        # if no `datetime` was provided - or in the state it was in at
        # the time indicated by the passed `datetime` argument.
        self.set_header("Content-Type", "application/n-quads")
        self.set_header("Vary", "accept-datetime")

        sha = shasum(key.encode("utf-8"))

        # Fetch all relevant changes from the last "non-delta" onwards,
        # ordered by time. The returned delta-chain consists of either:
        # a snapshot followed by 0 or more deltas, or
        # a single delete.
        chain = list(
            CSet.select(CSet.time, CSet.type)
            .where(
                (CSet.repo == repo) &
                (CSet.hkey == sha) &
                (CSet.time <= ts) &
                (CSet.time >= SQL(
                    "COALESCE((SELECT time FROM cset "
                    "WHERE repo_id = %s "
                    "AND hkey_id = %s "
                    "AND time <= %s "
                    "AND type != %s "
                    "ORDER BY time DESC "
                    "LIMIT 1), 0)",
                    repo.id, sha, ts, CSet.DELTA)))
            .order_by(CSet.time).naive())

        if len(chain) == 0:
            # A resource does not exist for the given key.
            raise HTTPError(404)

        timegate_url = (self.request.protocol + "://" +
                        self.request.host + self.request.path)
        timemap_url = (self.request.protocol + "://" +
                       self.request.host + self.request.uri + "&timemap=true")

        self.set_header("Link",
                        '<%s>; rel="original"'
                        ', <%s>; rel="timegate"'
                        ', <%s>; rel="timemap"'
                        % (key, timegate_url, timemap_url))

        self.set_header("Memento-Datetime",
                        chain[-1].time.strftime(RFC1123DATEFMT))

        if chain[0].type == CSet.DELETE:
            # The last change was a delete. Return a 404 response with
            # appropriate "Link" and "Memento-Datetime" headers.
            raise HTTPError(404)

        # Load the data required in order to restore the resource state.
        blobs = (Blob.select(Blob.data)
                 .where(
                     (Blob.repo == repo) &
                     (Blob.hkey == sha) &
                     (Blob.time << map(lambda e: e.time, chain)))
                 .order_by(Blob.time).naive())

        if len(chain) == 1:
            # Special case, where we can simply return
            # the blob data of the snapshot.
            snap = blobs.first().data
            return self.finish(decompress(snap))

        stmts = set()

        for i, blob in enumerate(blobs.iterator()):
            data = decompress(blob.data)
            if i == 0:
                # Base snapshot for the delta chain
                stmts.update(data.splitlines())
            else:
                for line in data.splitlines():
                    mode, stmt = line[0], line[2:]
                    if mode == "A":
                        stmts.add(stmt)
                    else:
                        stmts.discard(stmt)

        self.write(join(stmts, "\n"))
    elif key and timemap:
        # Generate a timemap containing historic change information
        # for the requested key. The timemap is in the default link-format
        # or as JSON (http://mementoweb.org/guide/timemap-json/).
        sha = shasum(key.encode("utf-8"))

        csets = (CSet.select(CSet.time)
                 .where((CSet.repo == repo) & (CSet.hkey == sha))
                 .order_by(CSet.time.desc()).naive())

        # TODO: Paginate?

        csit = csets.iterator()

        try:
            first = csit.next()
        except StopIteration:
            # Resource for given key does not exist.
            raise HTTPError(404)

        req = self.request
        base = req.protocol + "://" + req.host + req.path

        accept = self.request.headers.get("Accept", "")

        if "application/json" in accept or "*/*" in accept:
            self.set_header("Content-Type", "application/json")

            self.write('{"original_uri": ' + json_encode(key))
            self.write(', "mementos": {"list":[')

            m = ('{{"datetime": "{0}", "uri": "' + base + '?key=' +
                 url_escape(key) + '&datetime={1}"}}')

            self.write(m.format(first.time.isoformat(),
                                first.time.strftime(QSDATEFMT)))
            for cs in csit:
                self.write(', ' + m.format(cs.time.isoformat(),
                                           cs.time.strftime(QSDATEFMT)))

            self.write(']}')
            self.write('}')
        else:
            m = (',\n'
                 '<' + base + '?key=' + url_escape(key) + '&datetime={0}>'
                 '; rel="memento"'
                 '; datetime="{1}"'
                 '; type="application/n-quads"')

            self.set_header("Content-Type", "application/link-format")

            self.write('<' + key + '>; rel="original"')
            self.write(m.format(first.time.strftime(QSDATEFMT),
                                first.time.strftime(RFC1123DATEFMT)))
            for cs in csit:
                self.write(m.format(cs.time.strftime(QSDATEFMT),
                                    cs.time.strftime(RFC1123DATEFMT)))
    elif index:
        # Generate an index of all URIs contained in the dataset at the
        # provided point in time or in its current state.
        self.set_header("Vary", "accept-datetime")
        self.set_header("Content-Type", "text/plain")

        page = int(self.get_query_argument("page", "1"))

        # Subquery for selecting max. time per hkey group
        mx = (CSet.select(CSet.hkey, fn.Max(CSet.time).alias("maxtime"))
              .where((CSet.repo == repo) & (CSet.time <= ts))
              .group_by(CSet.hkey)
              .order_by(CSet.hkey)
              .paginate(page, INDEX_PAGE_SIZE)
              .alias("mx"))

        # Query for all the relevant csets (those with max. time values)
        cs = (CSet.select(CSet.hkey, CSet.time)
              .join(mx, on=((CSet.hkey == mx.c.hkey_id) &
                            (CSet.time == mx.c.maxtime)))
              .where((CSet.repo == repo) & (CSet.type != CSet.DELETE))
              .alias("cs"))

        # Join with the hmap table to retrieve the plain key values
        hm = (HMap.select(HMap.val)
              .join(cs, on=(HMap.sha == cs.c.hkey_id)).naive())

        for h in hm.iterator():
            self.write(h.val + "\n")
    else:
        raise HTTPError(400)

def fetch():
    sys.stdout.write('loading')
    sys.stdout.flush()
    repos = Repo.load_sample()
    authors = {author.login: author for author in Author.load(FILE)}

    seen = 0
    total = len(repos)
    failures = []
    last_write = datetime.datetime.now()

    el = Elaborator()
    for repo in repos:
        seen += 1

        if repo.username in authors:
            logging.info("already fetched %s", repo.username)
            continue

        try:
            gh_data = el._gh_request(
                'GET',
                '/users/' + repo.username
            )
        except:
            #loop really needs to keep running
            logging.exception("problem! %s", repo)
            failures.append(repo)
            continue

        authors[repo.username] = Author(**{key: gh_data.get(key, None) for key in
                                           ['login',         # "octocat"
                                            'id',            # 1
                                            'avatar_url',    # "https://github.com/images/error/octocat_happy.gif"
                                            'gravatar_id',   # "somehexcode"
                                            'url',           # "https://api.github.com/users/octocat"
                                            'name',          # "monalisa octocat"
                                            'company',       # "GitHub"
                                            'blog',          # "https://github.com/blog"
                                            'location',      # "San Francisco"
                                            'email',         # "*****@*****.**"
                                            'hireable',      # false
                                            'bio',           # "There once was..."
                                            'public_repos',  # 2
                                            'public_gists',  # 1
                                            'followers',     # 20
                                            'following',     # 0
                                            'html_url',      # "https://github.com/octocat"
                                            'created_at',    # "2008-01-14T04:33:35Z"
                                            'type',          # "User"
                                            ]})
        logging.info("fetched %s", repo.username)

        progress_bar(seen, total)

        since_write = datetime.datetime.now() - last_write
        if since_write > datetime.timedelta(minutes=5):
            sys.stdout.write("\r(writing results)")
            sys.stdout.flush()
            Author.dump(authors.values(), FILE)
            last_write = datetime.datetime.now()

    print  # from progress bar line

    if failures:
        print "%s failures:" % len(failures)
        for f in failures:
            print " %s" % f
        print

    print 'writing out...'
    Author.dump(authors.values(), FILE)

def post(self):
    parsed_args = parser.parse_args()
    repo = Repo(creator=parsed_args['creator'], name=parsed_args['name'])
    session.add(repo)
    session.commit()
    return repo, 201

def put(self, username, reponame):
    # Create a new revision of the resource specified by `key`.
    fmt = self.request.headers.get("Content-Type", "application/n-triples")
    key = self.get_query_argument("key", None)

    if username != self.current_user.name:
        raise HTTPError(403)
    if not key:
        raise HTTPError(400)

    datestr = self.get_query_argument("datetime", None)
    ts = datestr and date(datestr, QSDATEFMT) or now()

    try:
        repo = (Repo.select(Repo.id).join(User)
                .where((User.name == username) & (Repo.name == reponame))
                .naive().get())
    except Repo.DoesNotExist:
        raise HTTPError(404)

    sha = shasum(key.encode("utf-8"))

    chain = list(
        CSet.select(CSet.time, CSet.type, CSet.len)
        .where(
            (CSet.repo == repo) &
            (CSet.hkey == sha) &
            (CSet.time >= SQL(
                "COALESCE((SELECT time FROM cset "
                "WHERE repo_id = %s "
                "AND hkey_id = %s "
                "AND type != %s "
                "ORDER BY time DESC "
                "LIMIT 1), 0)",
                repo.id, sha, CSet.DELTA)))
        .order_by(CSet.time).naive())

    if len(chain) > 0 and not ts > chain[-1].time:
        # Appended timestamps must be monotonically increasing!
        raise HTTPError(400)

    if len(chain) == 0:
        # Mapping for `key` likely does not exist:
        # Store the SHA-to-KEY mapping in HMap,
        # looking out for possible collisions.
        try:
            HMap.create(sha=sha, val=key)
        except IntegrityError:
            val = HMap.select(HMap.val).where(HMap.sha == sha).scalar()
            if val != key:
                raise HTTPError(500)

    # Parse and normalize into a set of N-Quad lines
    stmts = parse(self.request.body, fmt)
    snapc = compress(join(stmts, "\n"))

    if len(chain) == 0 or chain[0].type == CSet.DELETE:
        # Provide dummy value for `patch` which is never stored.
        # If we get here, we always store a snapshot later on!
        patch = ""
    else:
        # Reconstruct the previous state of the resource
        prev = set()

        blobs = (Blob.select(Blob.data)
                 .where(
                     (Blob.repo == repo) &
                     (Blob.hkey == sha) &
                     (Blob.time << map(lambda e: e.time, chain)))
                 .order_by(Blob.time).naive())

        for i, blob in enumerate(blobs.iterator()):
            data = decompress(blob.data)
            if i == 0:
                # Base snapshot for the delta chain
                prev.update(data.splitlines())
            else:
                for line in data.splitlines():
                    mode, stmt = line[0], line[2:]
                    if mode == "A":
                        prev.add(stmt)
                    else:
                        prev.discard(stmt)

        if stmts == prev:
            # No changes, nothing to be done. Bail out.
            return self.finish()

        patch = compress(join(
            map(lambda s: "D " + s, prev - stmts) +
            map(lambda s: "A " + s, stmts - prev), "\n"))

    # Calculate the accumulated size of the delta chain including
    # the (potential) patch from the previous to the pushed state.
    acclen = reduce(lambda s, e: s + e.len, chain[1:], 0) + len(patch)

    blen = len(chain) > 0 and chain[0].len or 0  # base length

    if (len(chain) == 0 or chain[0].type == CSet.DELETE or
            len(snapc) <= len(patch) or SNAPF * blen <= acclen):
        # Store the current state as a new snapshot
        Blob.create(repo=repo, hkey=sha, time=ts, data=snapc)
        CSet.create(repo=repo, hkey=sha, time=ts, type=CSet.SNAPSHOT, len=len(snapc))
    else:
        # Store a directed delta between the previous and current state
        Blob.create(repo=repo, hkey=sha, time=ts, data=patch)
        CSet.create(repo=repo, hkey=sha, time=ts, type=CSet.DELTA, len=len(patch))