Example #1
 def _getPage(self, kwargs, prs):
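     # Returns True when another page remains, False when the crawl is done,
     # and None when the request failed and should be retried.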
     qdata = """{
       repository(owner: "%(org)s", name:"%(repository)s") {
         pullRequests(
           first: %(size)s
           %(after)s
           orderBy: { field: UPDATED_AT, direction: DESC }
         ) {
           totalCount
           pageInfo {
             hasNextPage endCursor
           }
           edges {
             node {
               %(pr_query)s
             }
           }
         }
       }
     }"""  # noqa: E501
     data = self.gql.query(qdata % kwargs)
     if "data" not in data:
         self.log.error("No data collected: %s" % data)
         if "message" in data and "wait a few minutes" in data["message"]:
             self.log.info("sleeping 2 mn")
             sleep(120)
         else:
             self.log.info("sleeping 20 s")
             sleep(20)
         return None
     if not kwargs["total_prs_count"]:
         kwargs["total_prs_count"] = data["data"]["repository"]["pullRequests"][
             "totalCount"
         ]
         self.log.info(
             "Total PRs: %s but will fetch until we reached a PR "
             "updated at date < %s"
             % (kwargs["total_prs_count"], kwargs["updated_since"])
         )
         if kwargs["total_prs_count"] == 0:
             return False
     edges = data["data"]["repository"]["pullRequests"]["edges"]
     for pr in edges:
         prs.append(pr["node"])
     # We sort to mitigate this
     # https://github.community/t5/GitHub-API-Development-and/apiv4-pullrequests-listing-broken-ordering/m-p/59439#M4968
     oldest_update = min(
         is8601_to_dt(pr["node"]["updatedAt"]) for pr in edges
     )
     self.log.info("page oldest updated at date is %s" % oldest_update)
     if oldest_update < kwargs["updated_since"]:
         # The crawler reached a page where the oldest updated PR
         # is older than the configured limit
         return False
     pageInfo = data["data"]["repository"]["pullRequests"]["pageInfo"]
     if pageInfo["hasNextPage"]:
         kwargs["after"] = 'after: "%s"' % pageInfo["endCursor"]
         return True
     else:
         return False
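
The query template above is expanded with plain %-interpolation against the kwargs dict. A minimal sketch of that expansion, trimmed to a few fields and with illustrative values (the real kwargs are assembled by get() in Example #5):

qdata = """{
  repository(owner: "%(org)s", name: "%(repository)s") {
    pullRequests(first: %(size)s %(after)s) { totalCount }
  }
}"""

# Illustrative kwargs; "after" is an empty string on the first page and an
# after: "<cursor>" argument on later pages (this cursor value is made up).
kwargs = {
    "org": "change-metrics",
    "repository": "monocle",
    "size": 25,
    "after": 'after: "Y3Vyc29yOjI1"',
}
print(qdata % kwargs)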
Example #2
 def get_rate_limit(self):
     ratelimit = self.getRateLimit()
     if ratelimit:
         self.quota_remain = ratelimit["remaining"]
         self.resetat = utils.is8601_to_dt(ratelimit["resetAt"])
         self.log.info("Got rate limit data: remain %s resetat %s" %
                       (self.quota_remain, self.resetat))
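
The parsed fields are typically consulted before issuing the next query. A hypothetical helper sketch, not part of the crawler; the threshold and the naive-UTC reading of resetat are assumptions:

from datetime import datetime
from time import sleep

def wait_if_exhausted(quota_remain, resetat, threshold=10):
    # Sleep until the reported reset time when the remaining quota is low.
    # Assumes resetat is a naive UTC datetime, as is8601_to_dt produces.
    if quota_remain is not None and quota_remain < threshold:
        delay = (resetat - datetime.utcnow()).total_seconds()
        if delay > 0:
            sleep(delay)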
Example #3
def _first_event_on_changes(es, index, repository_fullname, params):
    params = deepcopy(params)

    def keyfunc(x):
        return x["change_id"]

    groups = {}
    _events = _scan(es, index, repository_fullname, params)
    _events = sorted(_events, key=keyfunc)
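    # itertools.groupby only groups consecutive items, hence the sort above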
    # Keep by Change the created date + first event date
    for pr, events in groupby(_events, keyfunc):
        groups[pr] = {
            "change_created_at": None,
            "first_event_created_at": utcnow(),
            "first_event_author": None,
            "delta": None,
        }
        for event in events:
            if not groups[pr]["change_created_at"]:
                groups[pr]["change_created_at"] = is8601_to_dt(
                    event["on_created_at"])
            event_created_at = is8601_to_dt(event["created_at"])
            if event_created_at < groups[pr]["first_event_created_at"]:
                groups[pr]["first_event_created_at"] = event_created_at
                groups[pr]["delta"] = (groups[pr]["first_event_created_at"] -
                                       groups[pr]["change_created_at"])
                groups[pr]["first_event_author"] = event["author"]["muid"]
    ret = {"first_event_delay_avg": 0, "top_authors": {}}
    for pr_data in groups.values():
        ret["first_event_delay_avg"] += pr_data["delta"].seconds
        ret["top_authors"].setdefault(pr_data["first_event_author"], 0)
        ret["top_authors"][pr_data["first_event_author"]] += 1
    try:
        ret["first_event_delay_avg"] = int(ret["first_event_delay_avg"] /
                                           len(groups))
    except ZeroDivisionError:
        ret["first_event_delay_avg"] = 0
    ret["top_authors"] = sorted(
        [(k, v) for k, v in ret["top_authors"].items()],
        key=lambda x: x[1],
        reverse=True,
    )[:10]
    return ret
Example #4
def _first_event_on_changes(es, index, repository_fullname, params):
    params = deepcopy(params)

    def keyfunc(x):
        return x['change_id']

    groups = {}
    _events = _scan(es, index, repository_fullname, params)
    _events = sorted(_events, key=keyfunc)
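    # itertools.groupby only groups consecutive items, hence the sort above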
    # Keep by Change the created date + first event date
    for pr, events in groupby(_events, keyfunc):
        groups[pr] = {
            'change_created_at': None,
            'first_event_created_at': utcnow(),
            'first_event_author': None,
            'delta': None,
        }
        for event in events:
            if not groups[pr]['change_created_at']:
                groups[pr]['change_created_at'] = is8601_to_dt(event['on_created_at'])
            event_created_at = is8601_to_dt(event['created_at'])
            if event_created_at < groups[pr]['first_event_created_at']:
                groups[pr]['first_event_created_at'] = event_created_at
                groups[pr]['delta'] = (
                    groups[pr]['first_event_created_at']
                    - groups[pr]['change_created_at']
                )
                groups[pr]['first_event_author'] = event['author']
    ret = {'first_event_delay_avg': 0, 'top_authors': {}}
    for pr_data in groups.values():
        # use total_seconds() rather than .seconds, which drops whole days
        ret['first_event_delay_avg'] += int(pr_data['delta'].total_seconds())
        ret['top_authors'].setdefault(pr_data['first_event_author'], 0)
        ret['top_authors'][pr_data['first_event_author']] += 1
    try:
        ret['first_event_delay_avg'] = int(ret['first_event_delay_avg'] / len(groups))
    except ZeroDivisionError:
        ret['first_event_delay_avg'] = 0
    ret['top_authors'] = sorted(
        [(k, v) for k, v in ret['top_authors'].items()],
        key=lambda x: x[1],
        reverse=True,
    )[:10]
    return ret
Example #5
 def get(
     self, updated_since: str, change_id: Optional[str] = None
 ) -> List[RawChange]:
     prs: List[RawChange] = []
     updated_since = is8601_to_dt(updated_since)
     get_commits = True
     kwargs = {
         "pr_query": self.get_pr_query(include_commits=get_commits),
         "org": self.org,
         "repository": self.repository,
         "updated_since": updated_since,
         "after": "",
         "total_prs_count": 0,
         "size": self.size,
     }
     one = 0
     while True:
         self.log.info(
             "Running request %s"
             % dict([(k, v) for k, v in kwargs.items() if k != "pr_query"])
         )
         try:
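             # hnp: "has next page" flag from _getPage (True/False/None)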
             hnp = self._getPage(kwargs, prs)
             if kwargs["size"] == 1:
                 self.log.debug("Getting this PR, with page size 1: %s" % prs[0])
             kwargs["size"] = min(MAX_BULK_SIZE, int(kwargs["size"] * AUGMENT) + 1)
             one = 0
             if not get_commits:
                 self.log.info("Will get full commits on next query.")
                 get_commits = True
                 kwargs["pr_query"] = self.get_pr_query(include_commits=get_commits)
         except RequestTimeout:
             kwargs["size"] = max(1, kwargs["size"] // REDUCE)
             if kwargs["size"] == 1:
                 one += 1
                 if one == MAX_TRY - 1:
                     self.log.info(
                         "%d timeouts in a raw for one pr, retrying without commits."
                         % (MAX_TRY - 1)
                     )
                     get_commits = False
                     kwargs["pr_query"] = self.get_pr_query(
                         include_commits=get_commits
                     )
                 elif one >= MAX_TRY:
                     self.log.info(
                         "%d timeouts in a raw for one pr, giving up." % MAX_TRY
                     )
                     raise
             continue
         self.log.info("%s PRs fetched" % len(prs))
         if hnp is False:
             break
     return prs
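
The loop adapts its page size: grow after each successful page, shrink on a timeout. A toy trace of the resizing arithmetic with assumed constants (MAX_BULK_SIZE, AUGMENT and REDUCE are defined elsewhere in the crawler; these values are illustrative):

MAX_BULK_SIZE = 100  # illustrative cap
AUGMENT = 1.1        # illustrative growth factor
REDUCE = 2           # illustrative shrink divisor

size = 25
size = min(MAX_BULK_SIZE, int(size * AUGMENT) + 1)  # after a success: 28
size = max(1, size // REDUCE)                       # after a timeout: 14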
Example #6
def ensure_gte_lte(es, index, repository_fullname, params):
    if not params.get("gte"):
        first_created_event = _first_created_event(es, index,
                                                   repository_fullname, params)
        if first_created_event:
            params["gte"] = int(
                is8601_to_dt(first_created_event).timestamp() * 1000)
        else:
            # There is probably nothing in the db that match the query
            params["gte"] = None
    if not params.get("lte"):
        params["lte"] = int(utcnow().timestamp() * 1000)
Example #7
 def get(self, updated_since):
     prs = []
     updated_since = is8601_to_dt(updated_since)
     get_commits = True
     kwargs = {
         'pr_query': self.get_pr_query(include_commits=get_commits),
         'org': self.org,
         'repository': self.repository,
         'updated_since': updated_since,
         'after': '',
         'total_prs_count': 0,
         'size': self.size,
     }
     one = 0
     while True:
         self.log.info(
             'Running request %s' %
             dict([(k, v) for k, v in kwargs.items() if k != 'pr_query']))
         try:
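             # hnp: "has next page" flag from _getPage (True/False/None)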
             hnp = self._getPage(kwargs, prs)
             if kwargs['size'] == 1:
                 self.log.debug('Getting this PR, with page size 1: %s' %
                                prs[0])
             kwargs['size'] = min(MAX_BULK_SIZE,
                                  int(kwargs['size'] * AUGMENT) + 1)
             one = 0
             if not get_commits:
                 self.log.info('Will get full commits on next query.')
                 get_commits = True
                 kwargs['pr_query'] = self.get_pr_query(
                     include_commits=get_commits)
         except RequestTimeout:
             kwargs['size'] = max(1, kwargs['size'] // REDUCE)
             if kwargs['size'] == 1:
                 one += 1
                 if one == MAX_TRY - 1:
                     self.log.info(
                         '%d timeouts in a row for one pr, retrying without commits.'
                         % (MAX_TRY - 1))
                     get_commits = False
                     kwargs['pr_query'] = self.get_pr_query(
                         include_commits=get_commits)
                 elif one >= MAX_TRY:
                     self.log.info(
                         '%d timeouts in a row for one pr, giving up.' %
                         MAX_TRY)
                     raise
             continue
         self.log.info("%s PRs fetched" % len(prs))
         if hnp is False:
             break
     return prs
Example #8
 def get(self,
         updated_since: str,
         change_id: Optional[str] = None) -> List[RawChange]:
     if not change_id:
         request_params = "?q=after:%s+project:%s" % (
             utils.is8601_to_dt(updated_since).strftime("%Y-%m-%d"),
             self.repository_prefix,
         )
     else:
         request_params = "?q=change:%s" % change_id
     for option in [
             "MESSAGES",
             "DETAILED_ACCOUNTS",
             "DETAILED_LABELS",
             "CURRENT_REVISION",
             "CURRENT_FILES",
             "CURRENT_COMMIT",
     ]:
         request_params += "&o=%s" % option
     count = 100
     start_after = 0
     reviews = []
     while True:
         urlpath = (self.base_url + "/changes/" + request_params +
                    "&n=%s&start=%s" % (count, start_after))
         self.log.info("query: %s" % urlpath)
         try:
             response = requests.get(urlpath,
                                     verify=not self.insecure,
                                     auth=self.auth)
             response.raise_for_status()
         except Exception:
             self.log.exception(
                 "Unable to process the Gerrit query request")
             break
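         # Gerrit prefixes JSON responses with ")]}'" to prevent XSSI;
         # skip those four characters before parsing.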
         _reviews = json.loads(response.text[4:])
         if _reviews:
             reviews.extend(_reviews)
             self.log.info("read %s reviews from the api" % len(reviews))
             if reviews[-1].get("_more_changes"):
                 start_after = len(reviews)
             else:
                 break
         else:
             break
     if self.prefix:
         for review in reviews:
             review["project"] = self.prefix + review["project"]
     return reviews
Example #9
 def get(self, updated_since, change=None):
     if not change:
         request_params = "?q=after:%s+project:%s" % (
             utils.is8601_to_dt(updated_since).strftime("%Y-%m-%d"),
             self.repository_prefix,
         )
     else:
         request_params = "?q=change:%s" % change
     for option in [
         'MESSAGES',
         'DETAILED_ACCOUNTS',
         'DETAILED_LABELS',
         'CURRENT_REVISION',
         'CURRENT_FILES',
         'CURRENT_COMMIT',
     ]:
         request_params += '&o=%s' % option
     count = 100
     start_after = 0
     reviews = []
     while True:
         urlpath = (
             self.base_url
             + '/changes/'
             + request_params
             + '&n=%s&start=%s' % (count, start_after)
         )
         self.log.info("query: %s" % urlpath)
         try:
             response = requests.get(
                 urlpath, verify=not self.insecure, auth=self.auth
             )
             response.raise_for_status()
         except Exception:
             self.log.exception('Unable to process the Gerrit query request')
             break
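         # Gerrit prefixes JSON responses with ")]}'" to prevent XSSI;
         # skip those four characters before parsing.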
         _reviews = json.loads(response.text[4:])
         if _reviews:
             reviews.extend(_reviews)
             self.log.info("read %s reviews from the api" % len(reviews))
             if reviews[-1].get('_more_changes'):
                 start_after = len(reviews)
             else:
                 break
         else:
             break
     return reviews
Example #10
def cold_changes(es, index, repository_fullname, params):
    params = deepcopy(params)
    size = params.get("size")
    params["etype"] = ("Change",)
    params["state"] = ("OPEN",)
    changes = _scan(es, index, repository_fullname, params)
    _changes_ids = set([change["change_id"] for change in changes])
    params["etype"] = ("ChangeCommentedEvent", "ChangeReviewedEvent")
    del params["state"]
    events = _scan(es, index, repository_fullname, params)
    _events_ids = set([event["change_id"] for event in events])
    changes_ids_wo_rc = _changes_ids.difference(_events_ids)
    changes_wo_rc = [
        change for change in changes if change["change_id"] in changes_ids_wo_rc
    ]
    changes_wo_rc = enhance_changes(changes_wo_rc)
    items = sorted(changes_wo_rc, key=lambda x: is8601_to_dt(x["created_at"]))
    if size:
        items = items[:size]
    return {"items": items}
Example #11
def cold_changes(es, index, repository_fullname, params):
    params = deepcopy(params)
    size = params.get('size')
    params['etype'] = ('Change',)
    params['state'] = 'OPEN'
    changes = _scan(es, index, repository_fullname, params)
    _changes_ids = set([change['change_id'] for change in changes])
    params['etype'] = ('ChangeCommentedEvent', 'ChangeReviewedEvent')
    del params['state']
    events = _scan(es, index, repository_fullname, params)
    _events_ids = set([event['change_id'] for event in events])
    changes_ids_wo_rc = _changes_ids.difference(_events_ids)
    changes_wo_rc = [
        change for change in changes if change['change_id'] in changes_ids_wo_rc
    ]
    changes_wo_rc = enhance_changes(changes_wo_rc)
    items = sorted(changes_wo_rc, key=lambda x: is8601_to_dt(x['created_at']))
    if size:
        items = items[:size]
    return {'items': items}
Example #12
 def timedelta(start, end):
     start = utils.is8601_to_dt(start)
     end = utils.is8601_to_dt(end)
     return int((start - end).total_seconds())
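
Every example above leans on the same helper. A minimal sketch of what is8601_to_dt presumably does, assuming the sources emit "YYYY-MM-DDTHH:MM:SSZ" timestamps (the real implementation may accept more formats):

from datetime import datetime

def is8601_to_dt(datestring):
    # Parse an ISO-8601 UTC timestamp such as "2020-01-01T12:00:00Z"
    # into a naive datetime, treated as UTC throughout the examples.
    return datetime.strptime(datestring, "%Y-%m-%dT%H:%M:%SZ")

# Usage, mirroring Example #12:
delta = is8601_to_dt("2020-01-02T00:00:00Z") - is8601_to_dt("2020-01-01T00:00:00Z")
assert int(delta.total_seconds()) == 86400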