def get_authors(self, mails=None, repos=None, fromdate=None, todate=None,
                merge_commit=None, metadata=None, mails_neg=False,
                domains=None, blacklisted_mails=None):
    """Return the author emails (duplicates removed) together with the
    amount of hits for each unique author_email. The hits value is the
    amount of commits for a given email.

    Returns a tuple (took, {author_email: commits_amount}).
    """
    # Avoid mutable default arguments; normalize None to fresh lists.
    mails = mails if mails is not None else []
    repos = repos if repos is not None else []
    metadata = metadata if metadata is not None else []
    params = {'index': self.index, 'doc_type': self.dbname}
    body = {
        "query": {
            "bool": {
                "filter": self.get_filter(mails, repos, metadata, mails_neg,
                                          domains, blacklisted_mails),
            }
        },
        "aggs": {
            "authors": {
                "terms": {
                    "field": "author_email",
                    "order": {"_count": "desc"},
                    # Very large size so that all buckets are returned.
                    "size": 1000000
                }
            }
        }
    }
    # Restrict to the requested date window; open bounds (None) are
    # stripped later by clean_empty.
    body["query"]["bool"]["filter"]["bool"]["must"].append(
        {"range": {
            "committer_date": {
                "gte": fromdate,
                "lt": todate,
            }
        }})
    # If merge_commit is None both merge and non-merge commits are counted.
    if merge_commit is not None:
        body["query"]["bool"]["filter"]["bool"]["must"].append(
            {"term": {
                "merge_commit": merge_commit
            }})
    params['body'] = body
    params['size'] = 0
    params = clean_empty(params)
    res = self.es.search(**params)
    took = res['took']
    res = [(b['key'], b['doc_count'])
           for b in res["aggregations"]["authors"]["buckets"]]
    return took, dict(res)
def get_tags(self, repos, fromdate=None, todate=None):
    """Fetch all tag documents for the given repos within a date range."""
    # One should-clause per repository: the tag must match one of them.
    repo_clauses = [
        {"bool": {"must": [{"term": {"repo": name}}]}} for name in repos
    ]
    date_clause = {
        "range": {
            "date": {
                "gte": fromdate,
                "lt": todate,
            }
        }
    }
    qfilter = {
        "bool": {
            "must": [date_clause],
            "should": repo_clauses,
        }
    }
    query = clean_empty({"query": {"bool": {"filter": qfilter}}})
    return list(
        scanner(self.es, query=query, index=self.index,
                doc_type=self.dbname))
def get_field_stats(self, field, mails=None, repos=None, fromdate=None,
                    todate=None, merge_commit=None, metadata=None,
                    mails_neg=False, domains=None, blacklisted_mails=None):
    """Return the stats about the specified field for authors and/or
    repos.

    Returns a tuple (took, stats_aggregation_dict).
    """
    # Avoid mutable default arguments; normalize None to fresh lists.
    mails = mails if mails is not None else []
    repos = repos if repos is not None else []
    metadata = metadata if metadata is not None else []
    params = {'index': self.index, 'doc_type': self.dbname}
    body = {
        "query": {
            "bool": {
                "filter": self.get_filter(mails, repos, metadata, mails_neg,
                                          domains, blacklisted_mails),
            }
        },
        "aggs": {
            "%s_stats" % field: {
                "stats": {
                    "field": field
                }
            }
        }
    }
    # Restrict to the requested date window; open bounds (None) are
    # stripped later by clean_empty.
    body["query"]["bool"]["filter"]["bool"]["must"].append(
        {"range": {
            "committer_date": {
                "gte": fromdate,
                "lt": todate,
            }
        }})
    # If merge_commit is None both merge and non-merge commits are included.
    if merge_commit is not None:
        body["query"]["bool"]["filter"]["bool"]["must"].append(
            {"term": {
                "merge_commit": merge_commit
            }})
    params['body'] = body
    params['size'] = 0
    params = clean_empty(params)
    res = self.es.search(**params)
    took = res['took']
    return took, res["aggregations"]["%s_stats" % field]
def subreq(mails):
    """Issue one msearch, pairing a header with a single-hit term
    query per author email (returns author_email/author_name only)."""
    request = []
    for email in mails:
        # msearch expects alternating header/body entries.
        request.append({'index': self.index, 'type': self.dbname})
        request.append({
            'query': {'term': {'author_email': email}},
            'size': 1,
            '_source': ["author_email", "author_name"],
        })
    request = clean_empty(request)
    return self.es.msearch(body=request)
def get_commits_amount(self, mails=None, repos=None, fromdate=None,
                       todate=None, merge_commit=None, metadata=None,
                       mails_neg=False, domains=None,
                       blacklisted_mails=None):
    """Return the amount of commits for authors and/or repos."""
    # Avoid mutable default arguments; normalize None to fresh lists.
    mails = mails if mails is not None else []
    repos = repos if repos is not None else []
    metadata = metadata if metadata is not None else []
    params = {'index': self.index, 'doc_type': self.dbname}
    body = {
        "query": {
            "bool": {
                "filter": self.get_filter(mails, repos, metadata, mails_neg,
                                          domains, blacklisted_mails),
            }
        }
    }
    # Restrict to the requested date window; open bounds (None) are
    # stripped later by clean_empty.
    body["query"]["bool"]["filter"]["bool"]["must"].append(
        {"range": {
            "committer_date": {
                "gte": fromdate,
                "lt": todate,
            }
        }})
    # If merge_commit is None both merge and non-merge commits are counted.
    if merge_commit is not None:
        body["query"]["bool"]["filter"]["bool"]["must"].append(
            {"term": {
                "merge_commit": merge_commit
            }})
    params['body'] = body
    params = clean_empty(params)
    res = self.es.count(**params)
    return res['count']
def get_authors_histo(self, mails=None, repos=None, fromdate=None,
                      todate=None, merge_commit=None, metadata=None,
                      mails_neg=False, domains=None, blacklisted_mails=None):
    """Return the histogram of authors for authors and/or repos.

    Each returned bucket carries the list of unique author emails for
    its period; doc_count is rewritten to be the amount of unique
    authors. Returns a tuple (took, buckets).
    """
    # Avoid mutable default arguments; normalize None to fresh lists.
    mails = mails if mails is not None else []
    repos = repos if repos is not None else []
    metadata = metadata if metadata is not None else []
    params = {'index': self.index, 'doc_type': self.dbname}
    qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                              blacklisted_mails)
    # Derive the histogram interval from the overall activity duration.
    duration = self.get_commits_time_delta(
        mails, repos, fromdate=fromdate, todate=todate, metadata=metadata,
        mails_neg=mails_neg, domains=domains,
        blacklisted_mails=blacklisted_mails)[2]
    interval = self.set_histo_granularity(duration)
    body = {
        "query": {
            "bool": {
                "filter": qfilter,
            }
        },
        "aggs": {
            "commits": {
                "date_histogram": {
                    "field": "committer_date",
                    "interval": interval,
                    "format": "yyyy-MM-dd",
                },
                "aggs": {
                    "authors_email": {
                        "terms": {
                            "field": "author_email",
                            # Very large size so that all buckets
                            # are returned.
                            "size": 1000000
                        },
                    }
                }
            }
        }
    }
    # Restrict to the requested date window; open bounds (None) are
    # stripped later by clean_empty.
    body["query"]["bool"]["filter"]["bool"]["must"].append(
        {"range": {
            "committer_date": {
                "gte": fromdate,
                "lt": todate,
            }
        }})
    # If merge_commit is None both merge and non-merge commits are included.
    if merge_commit is not None:
        body["query"]["bool"]["filter"]["bool"]["must"].append(
            {"term": {
                "merge_commit": merge_commit
            }})
    params['body'] = body
    params['size'] = 0
    params = clean_empty(params)
    res = self.es.search(**params)
    took = res['took']
    buckets = res["aggregations"]["commits"]["buckets"]
    for bucket in buckets:
        # Flatten the sub-aggregation into a plain list of emails and
        # report the amount of unique authors as the bucket doc_count.
        bucket['authors_email'] = [
            b['key'] for b in bucket['authors_email']['buckets']
        ]
        bucket['doc_count'] = len(bucket['authors_email'])
    return took, buckets
def get_top_field_by_lines(self, field, mails=None, repos=None,
                           fromdate=None, todate=None, merge_commit=None,
                           metadata=None, mails_neg=False, domains=None,
                           blacklisted_mails=None):
    """Return the ranking of field by lines changed.

    Returns a tuple (took, {field_value: sum_of_line_modifieds}).
    """
    # Avoid mutable default arguments; normalize None to fresh lists.
    mails = mails if mails is not None else []
    repos = repos if repos is not None else []
    metadata = metadata if metadata is not None else []
    params = {'index': self.index, 'doc_type': self.dbname}
    body = {
        "query": {
            "bool": {
                "filter": self.get_filter(mails, repos, metadata, mails_neg,
                                          domains, blacklisted_mails),
            }
        },
        "aggs": {
            "top-field-by-modified": {
                "terms": {
                    "field": field,
                    # Very large size so that all buckets are returned.
                    "size": 1000000,
                },
                "aggs": {
                    "modified": {
                        "sum": {
                            "field": "line_modifieds",
                        },
                    }
                }
            }
        }
    }
    # Restrict to the requested date window; open bounds (None) are
    # stripped later by clean_empty.
    body["query"]["bool"]["filter"]["bool"]["must"].append(
        {"range": {
            "committer_date": {
                "gte": fromdate,
                "lt": todate,
            }
        }})
    # If merge_commit is None both merge and non-merge commits are included.
    if merge_commit is not None:
        body["query"]["bool"]["filter"]["bool"]["must"].append(
            {"term": {
                "merge_commit": merge_commit
            }})
    params['body'] = body
    params['size'] = 0
    params = clean_empty(params)
    res = self.es.search(**params)
    took = res['took']
    top = [(b['key'], b['modified']['value'])
           for b in res["aggregations"]["top-field-by-modified"]["buckets"]]
    return took, dict(top)
def get_commits(self, mails=None, repos=None, fromdate=None, todate=None,
                start=0, limit=100, sort='desc', scan=False,
                merge_commit=None, metadata=None, mails_neg=False,
                domains=None, blacklisted_mails=None):
    """Return the list of commits for authors and/or repos.

    When scan is True a scanner over all matching documents is
    returned instead; otherwise a tuple (took, hits, commits) with
    paginated results sorted by committer_date/author_date.
    """
    # Avoid mutable default arguments; normalize None to fresh lists.
    mails = mails if mails is not None else []
    repos = repos if repos is not None else []
    metadata = metadata if metadata is not None else []
    params = {'index': self.index, 'doc_type': self.dbname}
    qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                              blacklisted_mails)
    # If None both are returned. If you expect to skip merge commits
    # then set merge_commit to False
    if merge_commit is not None:
        qfilter["bool"]["must"].append(
            {"term": {
                "merge_commit": merge_commit
            }})
    # Restrict to the requested date window; open bounds (None) are
    # stripped later by clean_empty.
    qfilter["bool"]["must"].append(
        {"range": {
            "committer_date": {
                "gte": fromdate,
                "lt": todate,
            }
        }})
    body = {
        "query": {
            "bool": {
                "filter": qfilter,
            }
        }
    }
    if scan:
        return scanner(self.es, query=body, index=self.index,
                       doc_type=self.dbname)
    params['body'] = body
    params['size'] = limit
    params['from_'] = start
    params['sort'] = "committer_date:%s,author_date:%s" % (sort, sort)
    params = clean_empty(params)
    res = self.es.search(**params)
    took = res['took']
    hits = res['hits']['total']
    # Newer Elasticsearch versions return the total as a dict with a
    # 'value' key; normalize to a plain integer.
    if isinstance(hits, dict) and 'value' in hits:
        hits = hits.get('value')
    commits = [r['_source'] for r in res['hits']['hits']]
    return took, hits, commits
def get_filter(self, mails, repos, metadata, mails_neg=False,
               domains=None, blacklisted_mails=None):
    """Compute the search filter shared by the query methods.

    mails/repos may be lists or dicts ({mail: date_bounds} /
    {repo: paths}); lists are normalized to dicts with None values.
    Returns the bool filter dict with empty clauses removed by
    clean_empty.
    """
    if isinstance(mails, list):
        mails = {mail: None for mail in mails}
    if isinstance(repos, list):
        repos = {repo: None for repo in repos}
    if not domains:
        domains = []
    # Renamed from `filter` to avoid shadowing the builtin.
    qfilter = {
        "bool": {
            "must": [],
            "must_not": [],
        }
    }
    must_mail_clause = {"bool": {"should": [], "must_not": []}}
    for mail, date_bounds in mails.items():
        must = {"bool": {"must": [{"term": {"author_email": mail}}]}}
        if date_bounds:
            # Restrict this author's commits to the provided window.
            must["bool"]["must"].append({
                "range": {
                    "committer_date": {
                        "gte": date_bounds.get('begin-date'),
                        "lt": date_bounds.get('end-date')
                    }
                }
            })
        if mails_neg:
            must_mail_clause["bool"]["must_not"].append(must)
        else:
            must_mail_clause["bool"]["should"].append(must)
    for domain in domains:
        clause = {"bool": {"must": [
            {"term": {"author_email_domain": domain}}]}}
        if mails_neg:
            must_mail_clause["bool"]["must_not"].append(clause)
        else:
            must_mail_clause["bool"]["should"].append(clause)
    qfilter["bool"]["must"].append(must_mail_clause)
    must_project_clause = {"bool": {"should": []}}
    for repo, paths in repos.items():
        repo_clause = {"bool": {"must": [{
            "bool": {
                "must": [{"term": {"repos": repo}}],
                "should": [],
                "filter": [],
            }
        }]}}
        if paths:
            # Limit matched commits to those touching the given paths.
            repo_clause["bool"]["must"][0]["bool"]["filter"].append(
                {"terms": {"files_list": list(paths)}})
        must_project_clause["bool"]["should"].append(repo_clause)
    qfilter["bool"]["must"].append(must_project_clause)
    must_metadata_clause = {"bool": {"should": []}}
    for key, value in metadata:
        if value is None:
            # A bare key only requires the field to exist.
            must_metadata_clause["bool"]["should"].append(
                {"exists": {
                    "field": key
                }})
        else:
            must_metadata_clause["bool"]["should"].append(
                {"term": {
                    key: value
                }})
    qfilter["bool"]["must"].append(must_metadata_clause)
    # Exclude commits with a bogus epoch date (1970-01-01): only keep
    # commits strictly later than one day after the epoch.
    bogus_date_clause = {
        "range": {
            "committer_date": {
                "gte": 86401,
            }
        }
    }
    qfilter["bool"]["must"].append(bogus_date_clause)
    if blacklisted_mails:
        for mail in blacklisted_mails:
            qfilter["bool"]["must_not"].append(
                {"term": {
                    "author_email": mail
                }})
    return clean_empty(qfilter)