Example No. 1
def add_repo(project_id):
    data = request.get_json()
    project = Project.query.get(project_id)
    if project is None:
        return jsonify(message="Project not found"), 404
    if not project.course.is_user(current_user.id):
        return jsonify(message="Unauthorized to add project"), 403
    repo = project.get_repo_for_user(current_user.id)
    if repo:
        return jsonify(message="Already added repo for project"), 400

    repo = Repo(project_id=project.id, name=data['repo_name'])
    repo.user_id = current_user.id
    if project.type is not ProjectTypes.IND:
        g = project.get_group_for_user(current_user.id)
        if g is None:
            return jsonify(message="Not part of group"), 400
        repo.group_id = g.id

    webhook = add_webhook(repo.name, current_user.oauth_token)
    if 'id' not in webhook:
        return jsonify(error="Could not create webhook", data=webhook), 401
    repo.webhook_id = webhook['id']

    db.session.add(repo)
    db.session.commit()
    return jsonify(repo=ps.RepoSchema().dump(repo)), 201
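A hedged usage sketch for the endpoint above, assuming it is registered as POST /projects/<project_id>/repo and that a logged-in Flask test client is available (the route path and auth setup are assumptions; the example only shows the view function):

def test_add_repo(client):
    # hypothetical route and fixture; adjust to the app's actual URL map
    resp = client.post("/projects/1/repo", json={"repo_name": "alice/example-repo"})
    if resp.status_code == 201:
        assert "repo" in resp.get_json()  # serialized by ps.RepoSchema
    else:
        # 404 unknown project, 403 not a course member,
        # 400 repo already added, 401 webhook creation failed
        assert resp.status_code in (400, 401, 403, 404)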
    def test_add_topics(self):
        aggregator = Aggregator()
        aggregator.add(Repo(False, 10, "python", ["Flask"]))
        self.assertEqual(aggregator.topics, {"flask": 1})
        aggregator.add(Repo(False, 10, "python", ["flask"]))
        self.assertEqual(aggregator.topics, {"flask": 2})
        aggregator.add(Repo(False, 10, "python", ["flask", "numpy"]))
        self.assertEqual(aggregator.topics, {"flask": 3, "numpy": 1})

    def test_add_languages(self):
        aggregator = Aggregator()
        aggregator.add(Repo(False, 10, "Python", ["Flask"]))
        self.assertEqual(aggregator.languages, {"python": 1})
        aggregator.add(Repo(False, 10, "python", ["Flask"]))
        self.assertEqual(aggregator.languages, {"python": 2})
        aggregator.add(Repo(False, 10, "java", ["Flask"]))
        self.assertEqual(aggregator.languages, {"python": 2, "java": 1})
        aggregator.add(Repo(False, 10, None, ["Flask"]))
        self.assertEqual(aggregator.languages, {"python": 2, "java": 1})
Example No. 4
    def create(self, request, *args, **kwargs):
        """ Generates db repo object """
        user = request.data['user']
        path = request.data['path']
        version = request.data['version']

        res = RepoManager(user).create()
        if not version:
            version = res

        r = Repo(user=user, path=path, version=version)
        r.save()

        rs = RepoSerializer(r)
        return Response(rs.data)
    def test_asdict(self):
        aggregator = Aggregator()
        expected = {
            "public_repo_count": 0,
            "public_repo_breakdown": {
                "original_repo_count": 0,
                "forked_repo_count": 0
            },
            "watcher_count": 0,
            "languages": [],
            "topics": []
        }
        self.assertEqual(aggregator.asdict(), expected)
        expected2 = {
            "public_repo_count": 1,
            "public_repo_breakdown": {
                "original_repo_count": 1,
                "forked_repo_count": 0
            },
            "watcher_count": 10,
            "languages": [{
                "name": "python",
                "count": 1
            }],
            "topics": [{
                "name": "flask",
                "count": 1
            }]
        }
        aggregator.add(Repo(True, 10, "python", ["flask"]))
        self.assertEqual(aggregator.asdict(), expected2)
Example No. 7
def set_random_day_for_monthly_job():
    for repo in Repo.select().where((Repo.random_job_day == None)):
        repo.random_job_day = random.randint(1, 28)
        task_logger.info(
            f"set random day for monthly job of repo '{repo.name}' at '{repo.random_job_day}'"
        )
        repo.save()
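set_random_day_for_monthly_job above (and the monthly-job snippets further down) assume a peewee model roughly along these lines; the fields are inferred from usage, so treat this as a sketch rather than the project's actual schema:

from peewee import Model, CharField, IntegerField

class Repo(Model):
    name = CharField()
    url = CharField()
    revision = CharField(null=True)
    state = CharField(default="working")
    random_job_day = IntegerField(null=True)  # day of the month, 1-28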
    def __init__(self, sample_dict=None, class_names=None):
        global _sample

        if self.feature_name is None:
            raise Exception('Provide feature_name field in subclass.')

        if sample_dict is None:
            if _sample is None:
                _sample = Repo.load_sample(separate=True)
            sample_dict = _sample

        if class_names is None:
            class_names = sorted(classes)

        self.class_names = class_names
        self.figure, self.ax = plt.subplots()

        hist_data = []
        for clsname in class_names:
            # ie 'class feature data'
            cfd = np.array([getattr(repo, self.feature_name) for repo in sample_dict[clsname]])

            if self.in_class_percentiles is not None:
                min_val, max_val = [np.percentile(cfd, i) for i in self.in_class_percentiles]
                cfd = np.array([e for e in cfd if min_val < e < max_val])

            hist_data.append(cfd)

        self.hist_data = hist_data
Example No. 9
    def get(self, username, reponame):
        try:
            repo = (Repo.select().join(User).alias("user").where(
                (User.name == username) & (Repo.name == reponame)).get())
            title = repo.user.name + "/" + repo.name

            timemap = self.get_query_argument("timemap", "false") == "true"
            datetime = self.get_query_argument("datetime", None)
            key = self.get_query_argument("key", None)

            if key and not timemap:
                self.render("repo/memento.html",
                            repo=repo,
                            key=key,
                            datetime=datetime)
            elif key and timemap:
                self.render("repo/history.html", repo=repo, key=key)
            else:
                cs = (CSet.select(fn.distinct(
                    CSet.hkey)).where(CSet.repo == repo).limit(5).alias("cs"))
                samples = (HMap.select(HMap.val).join(
                    cs, on=(HMap.sha == cs.c.hkey_id)))
                self.render("repo/show.html",
                            title=title,
                            repo=repo,
                            samples=list(samples))
        except Repo.DoesNotExist:
            raise HTTPError(404)
Example No. 10
    def get(self, username, reponame):
        try:
            repo = (Repo.select().join(User).alias("user")
                .where((User.name == username) & (Repo.name == reponame))
                .get())
            title = repo.user.name + "/" + repo.name

            timemap = self.get_query_argument("timemap", "false") == "true"
            datetime = self.get_query_argument("datetime", None)
            key = self.get_query_argument("key", None)

            if key and not timemap:
                self.render("repo/memento.html", repo=repo, key=key,
                    datetime=datetime)
            elif key and timemap:
                self.render("repo/history.html", repo=repo, key=key)
            else:
                cs = (CSet.select(fn.distinct(CSet.hkey))
                    .where(CSet.repo == repo).limit(5).alias("cs"))
                samples = (HMap.select(HMap.val)
                    .join(cs, on=(HMap.sha == cs.c.hkey_id)))
                self.render("repo/show.html", title=title, repo=repo,
                    samples=list(samples))
        except Repo.DoesNotExist:
            raise HTTPError(404)
Example No. 11
    def get(self, username, reponame):
        try:
            repo = (Repo.select().join(User).alias("user")
                .where((User.name == username) & (Repo.name == reponame))
                .get())
            title = repo.user.name + "/" + repo.name

            timemap = self.get_query_argument("timemap", "false") == "true"
            datetime = self.get_query_argument("datetime", None)
            key = self.get_query_argument("key", None)
            index = self.get_query_argument("index", "false") == "true"

            if datetime:
                try:
                    ts = date(datetime, QSDATEFMT)
                except ValueError:
                    raise HTTPError(reason="Invalid format of datetime param",
                                    status_code=400)
            elif "Accept-Datetime" in self.request.headers:
                datestr = self.request.headers.get("Accept-Datetime")
                ts = date(datestr, RFC1123DATEFMT)
            else:
                ts = now()
            if key and not timemap:
                chain = revision_logic.get_chain_at_ts(repo, key, ts)
                # use ts of cset instead of now(), to make prev work
                if len(chain) != 0:
                    ts = chain[-1].time

                cs_prev = revision_logic.get_cset_prev_before_ts(repo, key, ts)
                cs_next = revision_logic.get_cset_next_after_ts(repo, key, ts)
                if cs_prev:
                    cs_prev_str = (self.request.protocol + "://" +
                                   self.request.host + self.request.path +
                                   "?key=" + key + "&datetime=" +
                                   cs_prev.time.strftime(QSDATEFMT))
                else:
                    cs_prev_str = ""
                if cs_next:
                    cs_next_str = (self.request.protocol + "://" +
                                   self.request.host + self.request.path +
                                   "?key=" + key + "&datetime=" +
                                   cs_next.time.strftime(QSDATEFMT))
                else:
                    cs_next_str = ""
                commit_message = revision_logic.get_commit_message(repo, key, ts)

                self.render("repo/memento.html", repo=repo, key=key,
                            datetime=datetime, cs_next_str=cs_next_str,
                            cs_prev_str=cs_prev_str,
                            commit_message=commit_message)
            elif key and timemap:
                self.render("repo/history.html", repo=repo, key=key)
            elif index:
                cs = (CSet.select(fn.distinct(CSet.hkey))
                      .where((CSet.repo == repo) & (CSet.time <= ts))
                      .alias("cs"))
                key_count = (HMap.select(HMap.val)
                             .join(cs, on=(HMap.sha == cs.c.hkey_id))
                             .count())

                page = int(self.get_query_argument("page", "1"))

                hm = revision_logic.get_repo_index(repo, ts, page)

                self.render("repo/index.html", repo=repo, title=title,
                            key_count=key_count,
                            page_size=revision_logic.INDEX_PAGE_SIZE,
                            hm=hm, current_page=page)
            else:
                hm = list(revision_logic.get_repo_index(repo, ts, 1, 5))
                # cs = (CSet.select(fn.distinct(CSet.hkey)).where(CSet.repo == repo).limit(5).alias("cs"))
                # samples = (HMap.select(HMap.val).join(cs, on=(HMap.sha == cs.c.hkey_id)))
                self.render("repo/show.html", title=title, repo=repo, hm=hm)
        except Repo.DoesNotExist:
            raise HTTPError(reason="Repo not found.", status_code=404)
Example No. 12
async def html_app(request, app_name):
    app = Repo.select().where(Repo.name == app_name)

    if app.count() == 0:  # count() is a method on peewee queries
        raise NotFound()

    return {"app": app[0], 'relative_path_to_root': '../../', 'path': request.path}
Example No. 13
async def launch_monthly_job():
    today = date.today().day

    for repo in Repo.select().where(Repo.random_job_day == today):
        task_logger.info(
            f"Launch monthly job for {repo.name} on day {today} of the month ")
        await create_job(repo.name, repo.url)
Example No. 14
    def test_repo_commits(self):
        repo = Repo.create(**self.TEST_REPO)
        repo.save()

        response = self.fetch(self.get_app().reverse_url('view', repo.id))

        self.assertIn(self.MESSAGES['no_records'], response.body.decode())

        self.assertIn(self.MESSAGES['get_more'], response.body.decode())

        for commit in range(self.TEST_COUNT):
            commit_data = self.TEST_COMMIT
            commit_data.update({'repo': repo})
            c = Commit(**commit_data)
            c.save()

        response = self.fetch(self.get_app().reverse_url('view', repo.id))

        self.assertEqual(response.body.decode().count(self.TEST_COMMIT['message']),
                         self.TEST_COUNT)

        self.assertIn(self.MESSAGES['get_more'], response.body.decode())

        repo.next_page = None
        repo.save()

        response = self.fetch(self.get_app().reverse_url('view', repo.id))

        self.assertNotIn(self.MESSAGES['get_more'], response.body.decode())
Example No. 15
def reconcile_db_with_gh(*args, **kwargs):
    ghc = GitHubConnector()
    issues = ghc.get_all_issues()
    repos = ghc.get_all_repos()

    for repo in repos:
        r = Repo(github_id=repo.id, name=repo.name)
        r.save()

    for issue in issues:
        i = Issue(github_id=issue.id)
        i.title = issue.title
        i.number = issue.number
        i.repo = Repo.objects.get(name=issue.repository[1])
        i.save()

    print "Not only did your task run successfully, but you're damned good looking too."
Example No. 16
def add_repository(url, comment, user_id):
    repo = Repo(url=url,
                timestamp=datetime.datetime.now(),
                user_id=user_id,
                comment=comment)
    db.session.add(repo)
    db.session.commit()
    return repo
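Usage is a single call; note that datetime.datetime.now() stores naive local time, so depending on the schema datetime.datetime.utcnow() may be preferable:

new_repo = add_repository("https://github.com/example/project", "mirror of upstream", user_id=1)  # illustrative values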
Example No. 17
    def get(self, username, reponame):
        try:
            repo = (Repo.select().join(User).alias("user")
                    .where((User.name == username) & (Repo.name == reponame))
                    .get())
            if not repo.private:
                self._get(repo)
            else:
                self._getAuth(repo)
        except Repo.DoesNotExist:
            raise HTTPError(reason="Repo not found.", status_code=404)
Example No. 18
    def post(self):
        reponame = self.get_argument("reponame", None)
        desc = self.get_argument("description", None)
        user = self.current_user
        if not reponame:
            self.redirect(self.reverse_url("web:create-repo"))
            return
        repo = Repo.create(user=user, name=reponame, desc=desc)
        self.redirect(self.reverse_url("web:repo", user.name, repo.name))
Example No. 20
    def test_init(self):
        is_original = True
        watcher_count = 10
        language = "Python"
        topics = ["Flask"]
        repo = Repo(is_original, watcher_count, language, topics)
        self.assertEqual(repo.is_original, is_original)
        self.assertEqual(repo.watcher_count, watcher_count)
        self.assertEqual(repo.language, language)
        self.assertEqual(repo.topics, topics)
Example No. 22
    def get(self):
        query = tornado.escape.url_unescape(self.get_argument("q", ""))

        if query:
            pattern = "%" + query + "%"
            repos = Repo.select().join(User).alias("user").where(Repo.name ** pattern, Repo.private == False)
            users = User.select().where(User.name ** pattern)
        else:
            repos = []
            users = []

        self.render("search/show.html", query=query, repos=repos, users=users)
Example No. 23
def change_status(target_repo, state):
    from models import Repo
#     class Repo(Document):
#         url = StringField(max_length=100,default='Not set yet')
#         last_used = DateTimeField(default=datetime.now())
#         created_on = DateTimeField(default=datetime.now())
#         monitoring = StringField(max_length=100,default='Not set yet')
#         state = StringField(max_length=50,default='Ready')
#         owner = StringField(max_length=50,default='no')
    
    
    if not use_database:
        return ''
    try:
        repo = Repo.objects.get(url=target_repo)
        repo.last_used = datetime.today()
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except DoesNotExist:
        repo = Repo()
        repo.url = target_repo
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except Exception as e:
        print('database_exception: ' + str(e))
Example No. 24
    def get(self):
        query = tornado.escape.url_unescape(self.get_argument("q", ""))

        if query:
            pattern = "%" + query + "%"
            repos = (Repo.select().join(User).alias("user")
                     .where(Repo.name ** pattern))
            users = User.select().where(User.name ** pattern)
        else:
            repos = []
            users = []

        self.render("search/show.html", query=query, repos=repos, users=users)
Example No. 25
async def launch_monthly_job(job_type):
    # XXX DRY
    job_command_last_part = ""
    if job_type == "arm":
        job_command_last_part = " (~ARM~)"
    elif job_type == "testing-unstable":
        job_command_last_part = [" (testing)", " (unstable)"]

    today = date.today().day

    for repo in Repo.select().where(Repo.random_job_day == today):
        task_logger.info(f"Launch monthly job for {repo.name} on day {today} of the month ")
        await create_job(repo.name, repo.app_list, repo, job_command_last_part)
Example No. 26
async def ws_app(request, websocket, app_name):
    # XXX I don't check if the app exists because this websocket is supposed to
    # be only loaded from the app page which does this job already
    app = Repo.select().where(Repo.name == app_name)[0]

    subscribe(websocket, f"app-jobs-{app.url}")

    await websocket.send(ujson.dumps({
        "action": "init_jobs",
        "data": Job.select().where(Job.url_or_path ==
                                   app.url).order_by(-Job.id),
    }))

    await websocket.wait_closed()
Example No. 27
async def html_job(request, job_id):
    job = Job.select().where(Job.id == job_id)

    if job.count() == 0:  # count() is a method on peewee queries
        raise NotFound()

    job = job[0]

    app = Repo.select().where(Repo.url == job.url_or_path)
    app = app[0] if app else None

    return {
        "job": job,
        'app': app,
        'relative_path_to_root': '../',
        'path': request.path
    }
Example No. 28
    def delete(self, username, reponame):
        # Check whether the key exists and if maybe the last change already is
        # a delete, else insert a `CSet.DELETE` entry without any blob data.

        key = self.get_query_argument("key")

        if username != self.current_user.name:
            raise HTTPError(403)

        if not key:
            raise HTTPError(400)

        datestr = self.get_query_argument("datetime", None)
        ts = date(datestr, QSDATEFMT) if datestr else now()

        try:
            repo = (Repo.select(Repo.id).join(
                User).where((User.name == username)
                            & (Repo.name == reponame)).naive().get())
        except Repo.DoesNotExist:
            raise HTTPError(404)

        sha = shasum(key.encode("utf-8"))

        try:
            last = (CSet.select(
                CSet.time,
                CSet.type).where((CSet.repo == repo)
                                 & (CSet.hkey == sha)).order_by(
                                     CSet.time.desc()).limit(1).naive().get())
        except CSet.DoesNotExist:
            # No changeset was found for the given key -
            # the resource does not exist.
            raise HTTPError(400)

        if not ts > last.time:
            # Appended timestamps must be monotonically increasing!
            raise HTTPError(400)

        if last.type == CSet.DELETE:
            # The resource was deleted already, return instantly.
            return self.finish()

        # Insert the new "delete" change.
        CSet.create(repo=repo, hkey=sha, time=ts, type=CSet.DELETE, len=0)
Example No. 29
    def test_commits_create(self, get_commits):
        fixtures = self.data.buffer.read()
        jsn = json.loads(fixtures.decode())

        request = HTTPRequest(self.TEST_REPO['href'])
        response = HTTPResponse(request, client.OK, buffer=io.BytesIO(fixtures))

        future = Future()
        future.set_result(response)
        get_commits.return_value = future

        body = {'href': self.TEST_REPO['href']}
        response = self.fetch(self.get_app().reverse_url('create'), method='POST',
                              body=urlencode(body).encode())

        self.assertIn('всего {}'.format(len(jsn)), response.body.decode())
        self.assertEqual(len(jsn), Commit.select().count())
        self.assertEqual(1, Repo.select().count())
Example No. 30
def change_status(target_repo, state):
    from models import Repo
    if not use_database:
        return ''
    try:
        repo = Repo.objects.get(url=target_repo)
        repo.last_used = datetime.today()
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except DoesNotExist:
        repo = Repo()
        repo.url = target_repo
        repo.state = state
        repo.owner = parent_folder
        repo.save()
    except Exception as e:
        print('database_exception: ' + str(e))
def get_samples():
    """Return a {'class': [reponames]}."""

    repos = Repo.load()
    fetch_dates = [datetime.datetime(*(r.fetch_ymd)) for r in repos]

    print('number of repos:', len(repos))

    latest_fetch = max(fetch_dates)
    print('fetched between %s and %s' % (min(fetch_dates), latest_fetch))
    print()

    filtered = [r for r in repos if
                30720 > r.size > 0 and  # not foolproof to avoid big repos
                r.stars > 1 and
                not r.fork and
                'dotfile' not in r.name.lower() and
                'sublime' not in r.name.lower()  # avoid SublimeText config
                ]
    print('after noise filter:', len(filtered))

    filtered = [r for r in filtered if
                ((latest_fetch - r.creation_date) >
                 datetime.timedelta(30))
                ]
    print('excluding very new:', len(filtered))

    filtered = [r for r in filtered if
                r.stars > 5 and
                classes.score(r) > (1 / 30)
                ]
    print('excluding very unpopular:', len(filtered))

    score_pairs = [(classes.score(r), r) for r in filtered]
    score_pairs.sort(key=lambda x: x[0])

    # top 1k, bottom 1k.
    return {'high': [r.name for (score, r) in score_pairs[-1000:]],
            'low': [r.name for (score, r) in score_pairs[:1000]],
            }
Example No. 32
    def get(self, username):
        try:
            user = User.select().where(User.name == username).get()
        except User.DoesNotExist:
            raise HTTPError(reason="User not found.", status_code=404)
        
        repos = Repo.select().where(Repo.user == user)
        reposit = repos.iterator()

        # TODO: Paginate?

        first = None
        try:
            first = next(reposit)
        except StopIteration:
            # No repos for user
            # No need to raise an error, just return empty list in json
            pass
            

        accept = self.request.headers.get("Accept", "")
        user_url = (self.request.protocol + "://" + self.request.host)

        if "application/json" in accept or "*/*" in accept:
            self.set_header("Content-Type", "application/json")

            self.write('{"username": '******', "repositories": {"list":[')

            m = ('{{"name": "{0}", "uri": "' + user_url +
                 '/'+username+'/{0}"}}')

            if first:
                self.write(m.format(first.name))

            for repo in reposit:
                self.write(', ' + m.format(repo.name))

            self.write(']}')
            self.write('}')
Example No. 33
    def cargar_datos(self):
        """Parses the contents of a JSON file."""

        datos_json = self.cargar_json()

        for k, v in datos_json.items():
            if k == "usuario":
                usuario = v
                self.usuario = UsuarioGit(usuario["usuario"], usuario["email"])
            elif k == "repos":
                for item in v:
                    aux = item
                    repo = Repo(
                                aux["nome"],
                                aux["rama"],
                                aux["remoto"],
                                aux["uri"],
                                aux["directorio"],
                                aux["ten_submodulos"],
                                aux["e_submodulo"]
                            )
                    self.add_repo(repo)
Example No. 35
    def test_add_none(self):
        aggregator = Aggregator()
        aggregator.add(Repo(None, 10, "Python", ["Flask"]))
        self.assertEqual(aggregator.count, 1)
        self.assertEqual(aggregator.original_count, 0)
        self.assertEqual(aggregator.forked_count, 0)
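Taken together, test_init, test_add_topics, test_add_languages, test_asdict, and test_add_none pin down the expected behavior of Repo and Aggregator. A minimal sketch that satisfies them (an illustration, not the project's actual implementation):

class Repo:
    def __init__(self, is_original, watcher_count, language, topics):
        self.is_original = is_original
        self.watcher_count = watcher_count
        self.language = language
        self.topics = topics


class Aggregator:
    def __init__(self):
        self.count = 0
        self.original_count = 0
        self.forked_count = 0
        self.watcher_count = 0
        self.languages = {}
        self.topics = {}

    def add(self, repo):
        self.count += 1
        # None means "unknown": counted in the total, but in neither breakdown
        if repo.is_original is True:
            self.original_count += 1
        elif repo.is_original is False:
            self.forked_count += 1
        self.watcher_count += repo.watcher_count
        if repo.language is not None:  # language may be missing
            key = repo.language.lower()
            self.languages[key] = self.languages.get(key, 0) + 1
        for topic in repo.topics:  # topic counts are case-insensitive
            key = topic.lower()
            self.topics[key] = self.topics.get(key, 0) + 1

    def asdict(self):
        return {
            "public_repo_count": self.count,
            "public_repo_breakdown": {
                "original_repo_count": self.original_count,
                "forked_repo_count": self.forked_count,
            },
            "watcher_count": self.watcher_count,
            "languages": [{"name": n, "count": c} for n, c in self.languages.items()],
            "topics": [{"name": n, "count": c} for n, c in self.topics.items()],
        }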
Example No. 36
def RunWithPars(pars, uid):
    startTime = Common.getCurrentTimeMil()
    reposFol = 'SavePath/Repos/'
    if not os.path.exists(config.basedir + '/' + reposFol):
        os.makedirs(config.basedir + '/' + reposFol)
    fname = reposFol + Common.gen_rnd_filename()
    os.makedirs(config.basedir + '/' + fname)
    ##ADD Field
    e = Repo()
    e.cloneFinishDate = "--RUNNING--"
    e.cloneStartDate = str(startTime)
    e.repoInfo = ''
    e.isPrivate = int(pars['isPrivate'])
    e.path = fname
    e.repoName = pars['repoName']
    e.url = pars['url']
    e.userId = uid
    db.session.add(e)
    db.session.commit()

    try:
        porcelain.clone(pars['url'], config.basedir + '/' + fname)
        endTime = Common.getCurrentTimeMil()
        e.cloneFinishDate = str(endTime)
        db.session.commit()

    except Exception as ex:
        print(ex)
        e.delete()
        db.session.commit()
import classes
from models import Repo
import utils


def get_classifier(X, y):
    return RandomForestClassifier(
        n_estimators=100, max_depth=None, min_samples_split=1,
        random_state=0,  # random seed is static for comparison
        compute_importances=True,
    )


if __name__ == '__main__':
    repos = Repo.load_sample()

    class_to_id, id_to_class = utils.create_bimap(classes.classes)

    dict_repos = []
    for r in repos:
        d = {mod: False for mod in utils.stdlib_module_names()}

        for mod in r.imported_stdlib_modules:
            d[mod] = True
        dict_repos.append(d)

    vectorizer = DictVectorizer(sparse=False)

    y = np.array([class_to_id[classes.classify(r)] for r in repos])
    X = vectorizer.fit_transform(dict_repos)
Example No. 38
def load_data(args):
    start_time = time.time()
    users = {}
    repos = {}
    popular_repos = []
    superprojects = {}

    print "Loading user to repo map"

    data = open(args[0], 'r')

    for line in data.readlines():
        user_id, repo_id = line.strip().split(':')

        user_id = int(user_id)
        repo_id = int(repo_id)

        if user_id in users:
            user = users[user_id]
        else:
            user = User(user_id)
            users[user_id] = user

        if repo_id in repos:
            repo = repos[repo_id]
        else:
            repo = Repo(repo_id)
            repos[repo_id] = repo


        repo.is_watched_by(user)
        user.is_watching(repo)
    
    data.close()

    print "Ordering repos by popularity"
    popular_repos = sorted(repos.values(), reverse=True,
            key=lambda x: x.popularity)

    owners = {}
    print "Reading repo details"
    repo_txt = open(args[1], 'r')
    for line in repo_txt.readlines():
        id, other = line.strip().split(':')
        id = int(id)

        if id not in repos:
            continue

        parts = other.split(',')

        repo = repos[id]

        owner, repo.name = parts[0].split('/', 1)
        repo.creation_date = parts[1]

        if owner in owners:
            repo.owner = owners[owner]
        else:
            repo.owner = RepoOwner(owner)
            owners[owner] = repo.owner

        repo.owner.owns.add(repo)

        if len(parts) > 2 and int(parts[2]) in repos:
            repo.forked_from = repos[int(parts[2])]
            repo.forked_from.forked_by.append(repo)

    repo_txt.close()

#    print "Grouping superprojects" 
#    superproject_keys = ['gnome', 'django', 'ruby', 'perl', 'rails']
#    for repo in repos.values():
#        for key in superproject_keys:
#            if key in repo.name.lower():
#                if key not in superprojects:
#                    superprojects[key] = []
#                superprojects[key].append(repo)

    print "Reading repo language"
    lang = open(args[2], 'r')
    for line in lang.readlines():
        id, other = line.strip().split(':')
        id = int(id)
        
        if id not in repos:
            continue

        parts = other.split(',')

        repo = repos[id]
        for part in parts:
            lang_name, count = part.split(';')
            repo.langs.append((lang_name, int(count)))

    lang.close()

    print "Data read in %d seconds" % (time.time() - start_time)

    return users, repos, popular_repos, superprojects
        for class_name in class_names:
            repos = class_map[class_name]
            feature_data = [getattr(repo, feature_name) for repo in repos]

            class_summaries.append([f(feature_data) for f in funcs])

        feature_summaries[feature_name] = np.array(class_summaries)

    for feature_name, summary in feature_summaries.items():
        print(feature_name)
        for i, class_name in enumerate(class_names):
            print('  ', class_name)
            print('    ', '\t'.join(str(e) for e in summary[i]))

        print('-----')
        print()


if __name__ == '__main__':
    class_map = Repo.load_sample(separate=True)

    summarize_features(class_map, ['with_stmt_usage',
                                   'compr_usage',
                                   'lambda_usage',
                                   'global_usage',
                                   'gen_exp_usage',
                                   'print_usage',
                                  ], sorted(classes))

    #summarize_imports(class_map)
Example No. 40
def workflow(changed_files, repo_str):
    global current_repo, current_user
    current_repo = Repo(name=repo_str, user=current_user, status="starting",
                        started_at=datetime.now(), progress=10)
    current_repo.save()
    dolog("deleting the repo if exists")
    change_status("preparation", 10)
    delete_forked_repo(repo_str)
    time.sleep(60)  # give GitHub time to finish the fork deletion before re-forking
    dolog("forking the repo %s" % repo_str)
    change_status("forking", 20)
    repo_url = fork_repo(repo_str)
    dolog("cloning the repo %s" % repo_url)
    change_status("cloning", 30)
    clone_repo(repo_url)
    # fork_cleanup()
    change_status("updating the fork", 40)
    update_fork(repo_str)  # update from upstream as the cloned repo is an old fork due to Github limitation
    dolog("getting jar configurations")
    change_status("getting configurations", 50)
    target_files, jar_command = get_jar_config(os.path.join(get_repo_abs_path(), 'jar.cfg'))
    if target_files is None or jar_command is None:
        dolog("get jar config failed")
        change_status("Error getting configurations", 100)
        delete_local_copy()
        return "get jar config failed"
    else:
        change_status("configuration parsed", 60)
    dolog("running if target")
    change_status("running if target", 70)
    is_found, msg = run_if_target(changed_files, target_files, jar_command)
    dolog("after running")
    if is_found:
        dolog("is found")
        change_status("pushing changes", 80)
        push_changes()
        dolog("after pushing the changes")
        change_status("creating pull request", 90)
        if create_pull_request(repo_str):
            dolog("pull request is True")
            change_status("pull request created", 100)
            current_repo.completed_at = datetime.now()
            current_repo.save()
            msg += " And pull request is created"
            dolog("deleting the forked repo attempt")
        else:
            dolog("pull request is False")
            change_status("pull failed to be created", 100)
            current_repo.completed_at = datetime.now()
            current_repo.save()
            msg += " And pull request failed to be created"
    else:
        change_status("not found", 100)
        dolog("not found")
        current_repo.completed_at = datetime.now()
        current_repo.save()
    return msg
Example No. 41
async def api_list_app(request):
    query = Repo.select()

    return response.json([model_to_dict(x) for x in query.order_by(Repo.name)])
Example No. 42
async def ws_apps(request, websocket):
    subscribe(websocket, "jobs")
    subscribe(websocket, "apps")

    # raw SQL because peewee strangely mangles this join and drops the
    # subquery fields, which breaks everything
    repos = Repo.raw('''
    SELECT
        "id",
        "name",
        "url",
        "revision",
        "state",
        "random_job_day",
        "job_id",
        "job_name",
        "job_state",
        "created_time",
        "started_time",
        "end_time"
    FROM
        "repo" AS "t1"
    INNER JOIN (
        SELECT
            "t1"."id" as "job_id",
            "t1"."name" as "job_name",
            "t1"."url_or_path",
            "t1"."state" as "job_state",
            "t1"."created_time",
            "t1"."started_time",
            "t1"."end_time"
        FROM
            "job" AS "t1"
        INNER JOIN (
            SELECT
                Max("t2"."id") AS "max_id"
            FROM
                "job" AS "t2"
            GROUP BY
                "t2"."url_or_path"
        )
        AS
            "t3"
        ON
            ("t1"."id" = "t3"."max_id")
    ) AS
        "t5"
    ON
        ("t5"."url_or_path" = "t1"."url")
    ORDER BY
        "name"
    ''')

    repos = [{
        "id": x.id,
        "name": x.name,
        "url": x.url,
        "revision": x.revision,
        "state": x.state,
        "random_job_day": x.random_job_day,
        "job_id": x.job_id,
        "job_name": x.job_name,
        "job_state": x.job_state,
        "created_time": (datetime.strptime(x.created_time.split(".")[0],
                                           '%Y-%m-%d %H:%M:%S')
                         if x.created_time else None),
        "started_time": (datetime.strptime(x.started_time.split(".")[0],
                                           '%Y-%m-%d %H:%M:%S')
                         if x.started_time else None),
        "end_time": (datetime.strptime(x.end_time.split(".")[0],
                                       '%Y-%m-%d %H:%M:%S')
                     if x.end_time else None),
    } for x in repos]

    # add apps without jobs
    selected_repos = {x["id"] for x in repos}
    for repo in Repo.select().where(Repo.id.not_in(selected_repos)):
        repos.append({
            "id": repo.id,
            "name": repo.name,
            "url": repo.url,
            "revision": repo.revision,
            "state": repo.state,
            "random_job_day": repo.random_job_day,
            "job_id": None,
            "job_name": None,
            "job_state": None,
            "created_time": None,
            "started_time": None,
            "end_time": None,
        })

    repos = sorted(repos, key=lambda x: x["name"])

    await websocket.send(ujson.dumps({
        "action": "init_apps",
        "data": repos,
    }))

    await websocket.wait_closed()
def calculate(f_to_calc, f_to_overwrite, console, download):
    """Calculate a list of features."""

    sys.stdout.write('loading')
    sys.stdout.flush()
    repos = Repo.load_sample()

    seen = 0
    total = len(repos)
    dl_failures = []
    calc_failures = []
    last_write = datetime.datetime.now()

    if f_to_calc or f_to_overwrite or download:
        for repo in repos:
            seen += 1
            success = True

            if download:
                success = utils.clone(repo)

            if not success:
                dl_failures.append(repo)
                continue

            try:
                if f_to_calc:
                    logging.info("calc: %s", repo)
                    repo.calculate_features(f_to_calc)

                if f_to_overwrite:
                    logging.info("calc: %s", repo)
                    repo.calculate_features(f_to_overwrite, overwrite=True)

                repo._clear_support_features()  # we're done with this repo now
            except Exception:
                print()  # from status line
                logging.exception("!problem: %s", repo)
                calc_failures.append(repo)
                print()

            progress_bar(seen, total)

            since_write = datetime.datetime.now() - last_write

            if since_write > datetime.timedelta(minutes=5):
                sys.stdout.write("\r(writing results)")
                sys.stdout.flush()
                Repo.write_update(repos)

                last_write = datetime.datetime.now()

    print()  # from progress bar line

    if dl_failures:
        print("%s failed to download:" % len(dl_failures))
        for f in dl_failures:
            print("  %s" % f)
        print()

    if calc_failures:
        print("%s failed during calc:" % len(calc_failures))
        for f in calc_failures:
            print("  %s" % f)
        print()

    if console:
        message = ('`repos` contains results;\n'
                   'use ^d to write out or `exit()` to cancel')
        code.interact(message, local=locals())

    print('writing out...')
    Repo.write_update(repos)
Example No. 44
async def monitor_apps_lists(monitor_git=False,
                             monitor_only_good_quality_apps=False):
    "parse apps lists every hour or so to detect new apps"

    # only support github for now :(
    async def get_master_commit_sha(url):
        command = await asyncio.create_subprocess_shell(
            f"git ls-remote {url} master",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE)
        data = await command.stdout.read()
        commit_sha = data.decode().strip().replace("\t", " ").split(" ")[0]
        return commit_sha

    async with aiohttp.ClientSession() as session:
        task_logger.info(f"Downloading applist...")
        async with session.get(APPS_LIST) as resp:
            data = await resp.json()
            data = data["apps"]

    repos = {x.name: x for x in Repo.select()}

    for app_id, app_data in data.items():
        commit_sha = await get_master_commit_sha(app_data["git"]["url"])

        if app_data["state"] != "working":
            task_logger.debug(
                f"skip {app_id} because state is {app_data['state']}")
            continue

        if monitor_only_good_quality_apps:
            if app_data.get("level") in [None, "?"] or app_data["level"] <= 4:
                task_logger.debug(
                    f"skip {app_id} because app is not good quality")
                continue

        # already know, look to see if there is new commits
        if app_id in repos:
            repo = repos[app_id]

            # but first check if the URL has changed
            if repo.url != app_data["git"]["url"]:
                old_url = repo.url
                task_logger.info(
                    f"Application {app_id} changed URL from {old_url} to {app_data['git']['url']}"
                )

                repo.url = app_data["git"]["url"]
                repo.save()

                await broadcast(
                    {
                        "action": "update_app",
                        "data": model_to_dict(repo),
                    }, "apps")

                # change the url of all scheduled jobs that still point at the
                # old URL; not perfect, since this could overwrite jobs that
                # were added by hand, but well...
                for job in Job.select().where(Job.url_or_path == old_url,
                                              Job.state == "scheduled"):
                    job.url_or_path = repo.url
                    job.save()

                    task_logger.info(
                        f"Updating job {job.name} #{job.id} for {app_id} to {repo.url} since the app has changed of url"
                    )

                    await broadcast(
                        {
                            "action": "update_job",
                            "data": model_to_dict(job),
                        }, [
                            "jobs", f"job-{job.id}",
                            f"app-jobs-{job.url_or_path}"
                        ])

            # we don't want to do anything else
            if not monitor_git:
                continue

            repo_is_updated = False
            if repo.revision != commit_sha:
                task_logger.info(
                    f"Application {app_id} has new commits on github "
                    f"({repo.revision} → {commit_sha}), schedule new job")
                repo.revision = commit_sha
                repo.save()
                repo_is_updated = True

                await create_job(app_id, repo.url)

            repo_state = "working" if app_data[
                "state"] == "working" else "other_than_working"

            if repo.state != repo_state:
                repo.state = repo_state
                repo.save()
                repo_is_updated = True

            if repo.random_job_day is None:
                repo.random_job_day = random.randint(1, 28)
                repo.save()
                repo_is_updated = True

            if repo_is_updated:
                await broadcast(
                    {
                        "action": "update_app",
                        "data": model_to_dict(repo),
                    }, "apps")

        # new app
        elif app_id not in repos:
            task_logger.info(f"New application detected: {app_id} " +
                             (", scheduling a new job" if monitor_git else ""))
            repo = Repo.create(
                name=app_id,
                url=app_data["git"]["url"],
                revision=commit_sha,
                state="working"
                if app_data["state"] == "working" else "other_than_working",
                random_job_day=random.randint(1, 28),
            )

            await broadcast(
                {
                    "action": "new_app",
                    "data": model_to_dict(repo),
                }, "apps")

            if monitor_git:
                await create_job(app_id, repo.url)

        await asyncio.sleep(1)

    # delete apps removed from the list
    unseen_repos = set(repos.keys()) - set(data.keys())

    for repo_name in unseen_repos:
        repo = repos[repo_name]

        # delete scheduled jobs first
        task_logger.info(
            f"Application {repo_name} has been removed from the app list; removing its scheduled jobs, if any..."
        )
        for job in Job.select().where(Job.url_or_path == repo.url,
                                      Job.state == "scheduled"):
            await api_stop_job(None, job.id)  # not sure this is going to work
            job_id = job.id

            task_logger.info(
                f"Delete scheduled job {job.name} #{job.id} for application {repo_name} because the application is being deleted."
            )

            data = model_to_dict(job)
            job.delete_instance()

            await broadcast({
                "action": "delete_job",
                "data": data,
            }, ["jobs", f"job-{job_id}", f"app-jobs-{job.url_or_path}"])

        task_logger.info(
            f"Delete application {repo_name} because it has been removed from the apps list."
        )
        data = model_to_dict(repo)
        repo.delete_instance()

        await broadcast({
            "action": "delete_app",
            "data": data,
        }, "apps")
Example No. 45
    def create_new_repo(url, branch):
        repo = Repo(url=url, branch=branch)
        repo.save()
        return repo
    #benchmark(PassiveAggressiveClassifier(n_iter=50), X, y, feature_names)

    #print 'kNN'
    #benchmark(KNeighborsClassifier(n_neighbors=10), X, y, feature_names)

    #print 'SGD'
    #benchmark(SGDClassifier(n_jobs=-1, alpha=.0001, n_iter=np.ceil(10**3), penalty="elasticnet", shuffle=True),
    #          X, y, feature_names)

    #print 'nearest centroid'
    #benchmark(NearestCentroid(), X, y, feature_names)

    #print 'naive bayes'
    #benchmark(MultinomialNB(alpha=.01), X, y, feature_names)

    #print 'naive bayes (bernoulli)'
    #benchmark(BernoulliNB(alpha=.01), X, y, feature_names)

    #classify(X, y, id_to_class, vec)
    # classify(select_by_pca(X, y), y, id_to_class, vec)


if __name__ == '__main__':
    ignore = ['imported_modules']
    #ignore += ['imported_stdlib_modules']

    features = [f for f in all_features if f not in ignore]
    #features = ['imported_stdlib_modules']

    _run(Repo.load_sample(), features)
Example No. 47
File: util.py Project: rozap/cs410
class Analyzer(object):

    def __init__(self, username, repo_name):
        self.repo = Repo(repo_path(username, repo_name))
        self.seen_files = {}
        try:
            print("Repo exists")
            self.repo_model = RepoModel.objects.get(username=username, name=repo_name)
        except RepoModel.DoesNotExist:
            self.repo_model = RepoModel(username=username, name=repo_name)
            self.repo_model.save()

        self.cached_data = {}



    def is_python_file(self, diff):
        if diff.a_blob and re.search('.*\.py$', diff.a_blob.name):
            return True
        return False


    def get_classes(self, entities):
        return [e for e in entities if type(e) == ast.ClassDef]


    def get_functions(self, entities):
        return [e for e in entities if type(e) == ast.FunctionDef]


    def get_all_funcs_from_body(self, body):
        funcs = self.get_functions(body)
        classes = self.get_classes(body)
        for c in classes:
            funcs = funcs + self.get_all_funcs_from_body(c.body)
        for f in funcs:
            funcs = funcs + self.get_all_funcs_from_body(f.body)
        return funcs



    def read_diffs(self, diffs):
        new_funcs = []
        files_changed = []
        for diff in diffs:

            if diff.a_blob and diff.b_blob:
                a_blob_text = diff.a_blob.data_stream.read()
                b_blob_text = diff.b_blob.data_stream.read()

                try:
                    a_syntax_tree = ast.parse(a_blob_text)
                except (ValueError, SyntaxError, TypeError) as e:
                    #Someone has committed some crap that's not valid python, 
                    #carry on...
                    continue                

                a_entities = a_syntax_tree.body
                a_funcs = self.get_all_funcs_from_body(a_entities)
                a_func_names = [f.name for f in a_funcs]
                
                file_name = diff.a_blob.abspath + diff.a_blob.name
                files_changed.append(file_name)
                if not self.seen_files.get(file_name, False):
                    # This is a new file, so ALL functions contained within it are new
                    self.seen_files[file_name] = True
                    new_funcs = new_funcs + [(diff.a_blob.abspath, fname) for fname in a_func_names]

                    print("New file!")
                    print(new_funcs)

                else:
                    #Not a new file, get what has changed, so get the next blob, parse it, and get the 
                    #functions from it. 

                    #Get the syntax_tree for the second blob
                    try:
                        b_syntax_tree = ast.parse(b_blob_text)
                    except (ValueError, SyntaxError, TypeError) as e:
                        #Someone has committed some crap that's not valid python, 
                        #carry on...
                        continue                


                    b_entities = b_syntax_tree.body
                    b_funcs = self.get_all_funcs_from_body(b_entities)
                    b_func_names = [f.name for f in b_funcs]

                    #xor the functions
                    new_in_this_diff = list(set(a_func_names) ^ set(b_func_names))
                    new_funcs = new_funcs + [(diff.a_blob.abspath, fname) for fname in new_in_this_diff]


        return new_funcs, files_changed


    def store(self, commit, new_funcs, files_changed):
        name = commit.author.name
        date = commit.committed_date

        self.cached_data[commit.hexsha] = {
            'name' : name,
            'date' : date,
            'funcs' : new_funcs, 
            'files_changed' : files_changed
        }

        if len(self.cached_data) > 30:
            with transaction.commit_on_success():
                self.do_save()


    def do_save(self):
        for hexsha in self.cached_data:

            val = self.cached_data[hexsha]
            try:
                actor = Actor.objects.get(full_name = val['name'])
            except Actor.DoesNotExist:
                actor = Actor(full_name = val['name'])
                actor.save()
                #Create the actor

            try:
                commit = Commit.objects.get(hexsha = hexsha)
            except Commit.DoesNotExist:
                commit = Commit(hexsha = hexsha, repo = self.repo_model, actor = actor)
                commit.save()

            for path, fun in val['funcs']:
                if not Function.objects.filter(name=fun, path=path).exists():
                    fmodel = Function(name=fun, commit=commit, path=path)
                    fmodel.save()
                    print("Saved `%s` : `%s`" % (path[-16:], fun))

            for file_name in val['files_changed']:
                FileChange(path = file_name, actor = actor, commit = commit).save()


        self.cached_data.clear()

    def walk_commits(self):
        
        #This uses a lot of memory, but...I don't see another way to go backwards
        #in git python
        commits = []
        for c in self.repo.iter_commits():
            commits.append(c)

        #pop the first commit off, so that all commits in the loop will have a parent
        commits.pop()
        
        while len(commits) > 0:
            commit = commits.pop()
            #Create a list of diffs based on the parent (aka commit before this commit)
            try:
                diffs = commit.diff(commit.parents[0])
                diffs = [d for d in diffs if self.is_python_file(d)]
                new_funcs, files_changed = self.read_diffs(diffs)
                self.store(commit, new_funcs, files_changed)
            except LookupError:
                #This seems to be a bug in PyGit maybe?
                #seems to throw this sometimes, not much we can do here...
                continue
Example No. 48
def get_repo_count():
    return Repo.select().count()
Example No. 49
    def get(self, username, reponame):
        timemap = self.get_query_argument("timemap", "false") == "true"
        index = self.get_query_argument("index", "false") == "true"
        key = self.get_query_argument("key", None)

        if (index and timemap) or (index and key) or (timemap and not key):
            raise HTTPError(400)

        if self.get_query_argument("datetime", None):
            datestr = self.get_query_argument("datetime")
            ts = date(datestr, QSDATEFMT)
        elif "Accept-Datetime" in self.request.headers:
            datestr = self.request.headers.get("Accept-Datetime")
            ts = date(datestr, RFC1123DATEFMT)
        else:
            ts = now()

        try:
            repo = (Repo.select(Repo.id).join(
                User).where((User.name == username)
                            & (Repo.name == reponame)).naive().get())
        except Repo.DoesNotExist:
            raise HTTPError(404)

        if key and not timemap:
            # Recreate the resource for the given key in its latest state -
            # if no `datetime` was provided - or in the state it was in at
            # the time indicated by the passed `datetime` argument.

            self.set_header("Content-Type", "application/n-quads")
            self.set_header("Vary", "accept-datetime")

            sha = shasum(key.encode("utf-8"))

            # Fetch all relevant changes from the last "non-delta" onwards,
            # ordered by time. The returned delta-chain consists of either:
            # a snapshot followed by 0 or more deltas, or
            # a single delete.
            chain = list(
                CSet.select(CSet.time, CSet.type).where((CSet.repo == repo) & (
                    CSet.hkey == sha) & (CSet.time <= ts) & (CSet.time >= SQL(
                        "COALESCE((SELECT time FROM cset "
                        "WHERE repo_id = %s "
                        "AND hkey_id = %s "
                        "AND time <= %s "
                        "AND type != %s "
                        "ORDER BY time DESC "
                        "LIMIT 1), 0)", repo.id, sha, ts, CSet.DELTA))).
                order_by(CSet.time).naive())

            if len(chain) == 0:
                # A resource does not exist for the given key.
                raise HTTPError(404)

            timegate_url = (self.request.protocol + "://" + self.request.host +
                            self.request.path)
            timemap_url = (self.request.protocol + "://" + self.request.host +
                           self.request.uri + "&timemap=true")

            self.set_header(
                "Link", '<%s>; rel="original"'
                ', <%s>; rel="timegate"'
                ', <%s>; rel="timemap"' % (key, timegate_url, timemap_url))

            self.set_header("Memento-Datetime",
                            chain[-1].time.strftime(RFC1123DATEFMT))

            if chain[0].type == CSet.DELETE:
                # The last change was a delete. Return a 404 response with
                # appropriate "Link" and "Memento-Datetime" headers.
                raise HTTPError(404)

            # Load the data required in order to restore the resource state.
            # (A list comprehension is used here: peewee's "<<" (IN) needs a
            # concrete sequence, and map() returns a lazy iterator in Python 3.)
            blobs = (Blob.select(Blob.data).where(
                (Blob.repo == repo) & (Blob.hkey == sha)
                & (Blob.time << [cset.time for cset in chain])).order_by(
                    Blob.time).naive())

            if len(chain) == 1:
                # Special case, where we can simply return
                # the blob data of the snapshot.
                snap = blobs.first().data
                return self.finish(decompress(snap))

            stmts = set()

            for i, blob in enumerate(blobs.iterator()):
                data = decompress(blob.data)

                if i == 0:
                    # Base snapshot for the delta chain
                    stmts.update(data.splitlines())
                else:
                    for line in data.splitlines():
                        mode, stmt = line[0], line[2:]
                        if mode == "A":
                            stmts.add(stmt)
                        else:
                            stmts.discard(stmt)

            self.write("\n".join(stmts))
        elif key and timemap:
            # Generate a timemap containing historic change information
            # for the requested key. The timemap is in the default link-format
            # or as JSON (http://mementoweb.org/guide/timemap-json/).

            sha = shasum(key.encode("utf-8"))

            csets = (CSet.select(
                CSet.time).where((CSet.repo == repo)
                                 & (CSet.hkey == sha)).order_by(
                                     CSet.time.desc()).naive())

            # TODO: Paginate?

            csit = csets.iterator()

            try:
                first = next(csit)
            except StopIteration:
                # Resource for given key does not exist.
                raise HTTPError(404)

            req = self.request
            base = req.protocol + "://" + req.host + req.path

            accept = self.request.headers.get("Accept", "")

            if "application/json" in accept or "*/*" in accept:
                self.set_header("Content-Type", "application/json")

                self.write('{"original_uri": ' + json_encode(key))
                self.write(', "mementos": {"list":[')

                m = ('{{"datetime": "{0}", "uri": "' + base + '?key=' +
                     url_escape(key) + '&datetime={1}"}}')

                self.write(
                    m.format(first.time.isoformat(),
                             first.time.strftime(QSDATEFMT)))

                for cs in csit:
                    self.write(', ' + m.format(cs.time.isoformat(),
                                               cs.time.strftime(QSDATEFMT)))

                self.write(']}')
                self.write('}')
            else:
                m = (',\n'
                     '<' + base + '?key=' + url_escape(key) + '&datetime={0}>'
                     '; rel="memento"'
                     '; datetime="{1}"'
                     '; type="application/n-quads"')

                self.set_header("Content-Type", "application/link-format")

                self.write('<' + key + '>; rel="original"')
                self.write(
                    m.format(first.time.strftime(QSDATEFMT),
                             first.time.strftime(RFC1123DATEFMT)))

                for cs in csit:
                    self.write(
                        m.format(cs.time.strftime(QSDATEFMT),
                                 cs.time.strftime(RFC1123DATEFMT)))
        elif index:
            # Generate an index of all URIs contained in the dataset at the
            # provided point in time or in its current state.

            self.set_header("Vary", "accept-datetime")
            self.set_header("Content-Type", "text/plain")

            page = int(self.get_query_argument("page", "1"))

            # Subquery for selecting max. time per hkey group
            mx = (CSet.select(
                CSet.hkey,
                fn.Max(CSet.time).alias("maxtime")).where(
                    (CSet.repo == repo) & (CSet.time <= ts)).group_by(
                        CSet.hkey).order_by(CSet.hkey).paginate(
                            page, INDEX_PAGE_SIZE).alias("mx"))

            # Query for all the relevant csets (those with max. time values)
            cs = (CSet.select(CSet.hkey, CSet.time).join(
                mx,
                on=((CSet.hkey == mx.c.hkey_id) & (CSet.time == mx.c.maxtime)
                    )).where((CSet.repo == repo)
                             & (CSet.type != CSet.DELETE)).alias("cs"))

            # Join with the hmap table to retrieve the plain key values
            hm = (HMap.select(HMap.val).join(
                cs, on=(HMap.sha == cs.c.hkey_id)).naive())

            for h in hm.iterator():
                self.write(h.val + "\n")
        else:
            raise HTTPError(400)
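
The snapshot-plus-delta replay above is the heart of the memento
reconstruction. Below is a minimal, self-contained sketch of the same
replay logic, assuming decompressed blob payloads where the first payload
is a full snapshot and each later one is a patch of "A "/"D "-prefixed
lines; the name `replay_chain` and the sample data are illustrative, not
part of the handler.

def replay_chain(payloads):
    """Rebuild a resource state from a snapshot followed by deltas.

    `payloads` is an iterable of decompressed blob strings, oldest first:
    the first is a full snapshot (one statement per line), each later one
    is a patch whose lines start with "A " (add) or "D " (delete).
    """
    stmts = set()
    for i, data in enumerate(payloads):
        if i == 0:
            stmts.update(data.splitlines())  # base snapshot
        else:
            for line in data.splitlines():
                mode, stmt = line[0], line[2:]
                if mode == "A":
                    stmts.add(stmt)
                else:
                    stmts.discard(stmt)
    return stmts

# Example: snapshot {s1, s2}, then a patch dropping s1 and adding s3.
assert replay_chain(["s1\ns2", "D s1\nA s3"]) == set(["s2", "s3"])
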
def fetch():
    sys.stdout.write('loading')
    sys.stdout.flush()
    repos = Repo.load_sample()
    authors = {author.login: author for author in Author.load(FILE)}

    seen = 0
    total = len(repos)
    failures = []
    last_write = datetime.datetime.now()

    el = Elaborator()

    for repo in repos:
        seen += 1

        if repo.username in authors:
            logging.info("already fetched %s", repo.username)
            continue

        try:
            gh_data = el._gh_request(
                'GET',
                '/users/' + repo.username
            )
        except Exception:
            # The loop really needs to keep running: log the failure,
            # record it for the summary, and move on.
            logging.exception("failed to fetch %s", repo)
            failures.append(repo)
            continue

        authors[repo.username] = Author(**{key: gh_data.get(key, None) for key in
                                           ['login',  # "octocat"
                                            'id',  # 1
                                            'avatar_url',  # "https://github.com/images/error/octocat_happy.gif"
                                            'gravatar_id',  # "somehexcode"
                                            'url',  # "https://api.github.com/users/octocat"
                                            'name',  # "monalisa octocat"
                                            'company',  # "GitHub"
                                            'blog',  # "https://github.com/blog"
                                            'location',  # "San Francisco"
                                            'email',  # "*****@*****.**"
                                            'hireable',  # false
                                            'bio',  # "There once was..."
                                            'public_repos',  # 2
                                            'public_gists',  # 1
                                            'followers',  # 20
                                            'following',  # 0
                                            'html_url',  # "https://github.com/octocat"
                                            'created_at',  # "2008-01-14T04:33:35Z"
                                            'type',  # "User"
                                            ]})

        logging.info("fetched %s", repo.username)

        progress_bar(seen, total)

        since_write = datetime.datetime.now() - last_write

        if since_write > datetime.timedelta(minutes=5):
            sys.stdout.write("\r(writing results)")
            sys.stdout.flush()
            Author.dump(authors.values(), FILE)

            last_write = datetime.datetime.now()

    print  # newline to finish the progress bar line

    if failures:
        print "%s failures:" % len(failures)
        for f in failures:
            print "  %s" % f
        print

    print 'writing out...'
    Author.dump(authors.values(), FILE)
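
The loop above combines three patterns: skip users that were already
fetched, tolerate per-item failures without aborting, and checkpoint
partial results every five minutes so a crash loses little work. A
stripped-down sketch of the checkpointing pattern, with `process` and
`save` as hypothetical stand-ins for the GitHub request and `Author.dump`:

import datetime

def run(items, process, save, interval=datetime.timedelta(minutes=5)):
    """Process items one by one, checkpointing results periodically."""
    results, failures = {}, []
    last_write = datetime.datetime.now()
    for item in items:
        try:
            results[item] = process(item)
        except Exception:
            failures.append(item)  # keep going; report at the end
            continue
        if datetime.datetime.now() - last_write > interval:
            save(results)  # checkpoint partial progress
            last_write = datetime.datetime.now()
    save(results)  # final write
    return results, failures
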
Example No. 51
0
    def post(self):
        parsed_args = parser.parse_args()
        repo = Repo(creator=parsed_args['creator'], name=parsed_args['name'])
        session.add(repo)
        session.commit()
        return repo, 201
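
For illustration, a client call against this endpoint might look like the
following; the mount point /repos and the host are assumptions, while the
`creator` and `name` fields follow the parser arguments above.

import requests

resp = requests.post(
    "http://localhost:5000/repos",  # hypothetical mount point
    json={"creator": "octocat", "name": "hello-world"},
)
assert resp.status_code == 201
print(resp.json())  # the serialized Repo row
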
Example No. 52
0
    def put(self, username, reponame):
        # Create a new revision of the resource specified by `key`.

        fmt = self.request.headers.get("Content-Type", "application/n-triples")
        key = self.get_query_argument("key", None)

        if username != self.current_user.name:
            raise HTTPError(403)

        if not key:
            raise HTTPError(400)

        datestr = self.get_query_argument("datetime", None)
        ts = date(datestr, QSDATEFMT) if datestr else now()

        try:
            repo = (Repo.select(Repo.id).join(
                User).where((User.name == username)
                            & (Repo.name == reponame)).naive().get())
        except Repo.DoesNotExist:
            raise HTTPError(404)

        sha = shasum(key.encode("utf-8"))

        # Fetch the delta chain for this key: everything from the most
        # recent snapshot or delete onwards (the COALESCE subquery falls
        # back to time 0 when no such change exists yet).
        chain = list(
            CSet.select(CSet.time, CSet.type, CSet.len).where(
                (CSet.repo == repo) & (CSet.hkey == sha) & (CSet.time >= SQL(
                    "COALESCE((SELECT time FROM cset "
                    "WHERE repo_id = %s "
                    "AND hkey_id = %s "
                    "AND type != %s "
                    "ORDER BY time DESC "
                    "LIMIT 1), 0)", repo.id, sha, CSet.DELTA))).order_by(
                        CSet.time).naive())

        if len(chain) > 0 and ts <= chain[-1].time:
            # Appended timestamps must be monotonically increasing!
            raise HTTPError(400)

        if len(chain) == 0:
            # Mapping for `key` likely does not exist:
            # Store the SHA-to-KEY mapping in HMap,
            # looking out for possible collisions.
            try:
                HMap.create(sha=sha, val=key)
            except IntegrityError:
                val = HMap.select(HMap.val).where(HMap.sha == sha).scalar()
                if val != key:
                    raise HTTPError(500)

        # Parse and normalize into a set of N-Quad lines
        stmts = parse(self.request.body, fmt)
        snapc = compress(join(stmts, "\n"))

        if len(chain) == 0 or chain[0].type == CSet.DELETE:
            # Provide dummy value for `patch` which is never stored.
            # If we get here, we always store a snapshot later on!
            patch = ""
        else:
            # Reconstruct the previous state of the resource
            prev = set()

            blobs = (Blob.select(Blob.data).where(
                (Blob.repo == repo) & (Blob.hkey == sha)
                & (Blob.time << map(lambda e: e.time, chain))).order_by(
                    Blob.time).naive())

            for i, blob in enumerate(blobs.iterator()):
                data = decompress(blob.data)

                if i == 0:
                    # Base snapshot for the delta chain
                    prev.update(data.splitlines())
                else:
                    for line in data.splitlines():
                        mode, stmt = line[0], line[2:]
                        if mode == "A":
                            prev.add(stmt)
                        else:
                            prev.discard(stmt)

            if stmts == prev:
                # No changes, nothing to be done. Bail out.
                return self.finish()

            patch = compress(
                join(
                    map(lambda s: "D " + s, prev - stmts) +
                    map(lambda s: "A " + s, stmts - prev), "\n"))

        # Calculate the accumulated size of the delta chain including
        # the (potential) patch from the previous to the pushed state.
        acclen = reduce(lambda s, e: s + e.len, chain[1:], 0) + len(patch)

        blen = len(chain) > 0 and chain[0].len or 0  # base length

        if (len(chain) == 0 or chain[0].type == CSet.DELETE
                or len(snapc) <= len(patch) or SNAPF * blen <= acclen):
            # Store the current state as a new snapshot
            Blob.create(repo=repo, hkey=sha, time=ts, data=snapc)
            CSet.create(repo=repo,
                        hkey=sha,
                        time=ts,
                        type=CSet.SNAPSHOT,
                        len=len(snapc))
        else:
            # Store a directed delta between the previous and current state
            Blob.create(repo=repo, hkey=sha, time=ts, data=patch)
            CSet.create(repo=repo,
                        hkey=sha,
                        time=ts,
                        type=CSet.DELTA,
                        len=len(patch))
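
The storage decision at the end is worth spelling out: a new snapshot is
written when there is no usable base, when the compressed snapshot is no
larger than the patch, or when the accumulated delta chain has grown past
SNAPF times the base snapshot's length (which bounds reconstruction cost).
Below is a minimal sketch of that heuristic together with the set-diff
patch construction; the SNAPF value of 10 is an assumption, as the
constant is defined elsewhere in the module.

SNAPF = 10  # assumed chain-growth factor

def make_patch(prev, curr):
    """Directed delta between two statement sets, as 'A '/'D ' lines."""
    lines = ["D " + s for s in prev - curr] + ["A " + s for s in curr - prev]
    return "\n".join(lines)

def should_snapshot(chain_lens, snap_len, patch_len):
    """Decide whether to store a full snapshot instead of a delta.

    `chain_lens` holds the stored `len` of each change in the current
    chain, base snapshot first.
    """
    if not chain_lens:
        return True  # no base to delta against
    base_len = chain_lens[0]
    acc_len = sum(chain_lens[1:]) + patch_len  # accumulated delta size
    return snap_len <= patch_len or SNAPF * base_len <= acc_len

# Example: a 100-byte base plus deltas totalling 1005 bytes triggers a
# fresh snapshot once SNAPF * 100 <= 1005.
assert should_snapshot([100, 400, 600], snap_len=300, patch_len=5)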