def search_files(patt, file_patt, repo_dir):
    """Run the search command for ``patt`` inside ``repo_dir``.

    Args:
        patt: Pattern handed to the search command as its final argument.
        file_patt: Optional value for the command's ``-g`` option (presumably
            a file glob -- confirm against the tool behind ``RG_CMD``).
            Skipped entirely when falsy.
        repo_dir: Directory used as the working directory of the command.

    Returns:
        The ``stdout`` attribute of the ``run_cap`` result.
    """
    file_filter = ['-g', file_patt] if file_patt else []
    search_cmd = [RG_CMD] + file_filter + [patt]
    return run_cap(search_cmd, cwd=repo_dir).stdout
def _get_py_version(plist, project, repo_dir):
    """Detect the minimum Python 2 / Python 3 versions reported for a repo.

    Runs ``VERSION_CMD -v <repo_dir>`` and parses the
    ``Minimum required versions:`` line found in the tail of its output.

    Args:
        plist: Not used by this function.
        project: Not used by this function.
        repo_dir: Directory to analyse; also used as the working directory.

    Returns:
        A dict with the keys ``'min_py2'`` and ``'min_py3'``. Each value is
        the version string taken from the output, or ``None`` when no entry
        for that major version was found (None indicates not compatible
        with that major version of python).
    """
    # -v flag required because it improves the correctness of detection
    # manual tests on the calibre package showed better results with -v
    cmd = [VERSION_CMD, '-v', repo_dir]
    proc_res = run_cap(cmd, cwd=repo_dir)

    # hardcoded prefix to search output lines for
    min_version_prefix = 'Minimum required versions:'

    # with -v flag a lot of output is generated, the lines containing
    # version information will probably be at the bottom
    min_version_line = None
    for line in proc_res.stdout.splitlines()[-5:]:
        if line.startswith(min_version_prefix):
            # no early exit: the last matching line wins
            min_version_line = line

    versions = {'min_py2': None, 'min_py3': None}
    if min_version_line is None:
        return versions

    # slice by the prefix length instead of a hand-counted offset so the
    # two cannot drift apart
    for token in min_version_line[len(min_version_prefix):].split():
        # the major version is decided on the raw token; the separating
        # comma is only stripped from the stored value
        if token.startswith('2'):
            versions['min_py2'] = token.strip(',')
        elif token.startswith('3'):
            versions['min_py3'] = token.strip(',')
    return versions
def collect(plist, project, repo_dir):
    """Determine the most likely license of the checkout in ``repo_dir``.

    Runs ``DETECT_CMD <repo_dir> --format json`` and reads the ``matches``
    list of the first entry of the JSON output. Only the three matches with
    the highest confidence are considered; of those, matches with a
    confidence below 0.9 or a license missing from ``LICENSE_MAP`` are
    dropped.

    Args:
        plist: Not used by this function.
        project: Not used by this function.
        repo_dir: Directory passed to the detection command.

    Returns:
        A dict with the keys:
          * ``'license'``: the normalized license name, or ``'Other'`` when
            no match survived the filtering.
          * ``'license_group'``: the part of the license name before the
            first run of non-word characters.
          * ``'hereditary'``: ``GROUP_HEREDITARY_MAP.get(group)`` (``None``
            for unknown groups).
    """
    ret = {}
    proc_res = run_cap([DETECT_CMD, repo_dir, '--format', 'json'])
    output_json = json.loads(proc_res.stdout)
    possible_licenses = glom.glom(output_json, '0.matches', default=[])

    # sort into descending order of confidence and keep the top three
    possible_licenses = sorted(possible_licenses,
                               key=lambda x: x['confidence'],
                               reverse=True)[:3]

    norm_licenses = []
    for pl in possible_licenses:
        if pl['confidence'] < 0.9:
            continue
        if pl['license'] not in LICENSE_MAP:
            continue
        norm_licenses.append((LICENSE_MAP[pl['license']],
                              round(pl['confidence'], 3)))

    if not norm_licenses:
        # no sufficiently confident match for a known license
        ret['license'] = 'Other'
    else:
        # sort in place; a bare sorted() call would throw its result away
        norm_licenses.sort(key=lambda x: x[1], reverse=True)
        if len(norm_licenses) < 3:
            # one or two candidates: trust the most confident one
            ret['license'] = norm_licenses[0][0]
        else:
            # three candidates: take the most frequent name; on a tie the
            # earlier (more confident) entry wins
            names = Counter(name for name, _ in norm_licenses)
            ret['license'] = names.most_common(1)[0][0]

    # raw string: '\W' is not a valid escape in a normal string literal
    group = re.split(r'\W+', ret['license'])[0]
    ret['license_group'] = group
    ret['hereditary'] = GROUP_HEREDITARY_MAP.get(group)
    return ret
def get_git_info(repo_dir):
    """Collect commit and committer statistics for the repo in ``repo_dir``.

    Args:
        repo_dir: Working directory for every ``git`` invocation.

    Returns:
        A dict with the keys:
          * ``'first_commit'``: ISO-formatted date of the earliest root
            commit reachable from ``HEAD``.
          * ``'latest_commit'``: ISO-formatted date of ``HEAD``.
          * ``'commit_count'``: sum of the commit counts of all committers.
          * ``'committer_count'``: number of committers returned by the
            ``CommitterRegistry``.
          * ``'committer_percent_dist'``: maps a percentage (10, 20, ...,
            95, 99, 100) to the number of top committers whose commits
            together reach that share of all commits.
          * ``'committer_top_5'``: commit share (rounded to 4 digits) of
            each of the five most active committers.
          * ``'minor_committer_counts'``: maps ``x`` in 1..5 to the number
            of committers with at most ``x`` commits.
    """
    ret = {}

    # a history may have several root commits; use the earliest one
    proc_res = run_cap(['git', 'rev-list', '--max-parents=0', 'HEAD'],
                       cwd=repo_dir)
    first_commit_hashes = proc_res.stdout.strip().split()
    first_commit_dt = min(_get_commit_dt(repo_dir, fch)
                          for fch in first_commit_hashes)

    proc_res = run_cap(['git', 'rev-parse', 'HEAD'], cwd=repo_dir)
    latest_commit_hash = proc_res.stdout.strip()
    latest_commit_dt = _get_commit_dt(repo_dir, latest_commit_hash)

    ret['first_commit'] = first_commit_dt.isoformat()
    ret['latest_commit'] = latest_commit_dt.isoformat()

    # HEAD is named explicitly: without a revision argument git shortlog
    # reads the log from stdin whenever stdin is not a terminal, which is
    # the normal situation for a captured subprocess
    proc_res = run_cap(['git', 'shortlog', '--summary', '--numbered',
                        '--email', 'HEAD'], cwd=repo_dir)

    committer_registry = CommitterRegistry()
    for match in _git_committer_re.finditer(proc_res.stdout):
        gdict = match.groupdict()
        committer_registry.register(gdict['name'], gdict['email'],
                                    int(gdict['commit_count']))

    committers = committer_registry.get_committers()
    ret['commit_count'] = commit_count = sum(c.commit_count
                                             for c in committers)
    ret['committer_count'] = len(committers)  # redundant with committer_percent_dist.100

    # these will be stored as percentages, so keep it to two-digit precision max
    threshes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0]

    # only the per-committer commit counts matter below, largest first
    sorted_counts = sorted((c.commit_count for c in committers), reverse=True)

    def _get_proportion_count(thresh_commit_count):
        # smallest number of top committers whose commits add up to at
        # least thresh_commit_count; reaching the threshold exactly is
        # enough, so the comparison is >= rather than >
        covered = 0
        for needed, count in enumerate(sorted_counts, start=1):
            covered += count
            if covered >= thresh_commit_count:
                return needed
        return len(sorted_counts)

    # how many developers' commits does it take to comprise XX% of the commits?
    ret['committer_percent_dist'] = {
        round(thresh * 100): _get_proportion_count(commit_count * thresh)
        for thresh in threshes
    }
    ret['committer_top_5'] = [round(count / commit_count, 4)
                              for count in sorted_counts[:5]]
    ret['minor_committer_counts'] = {
        x: sum(1 for count in sorted_counts if count <= x)
        for x in range(1, 6)
    }
    return ret
def _get_commit_dt(repo_dir, commit_hash, **kw):
    """Return the committer date of ``commit_hash`` as parsed by ``isoparse``.

    ``git show`` is asked for the committer date (``%cd``) formatted with
    ``format-local`` while ``TZ`` is forced to ``UTC``, so the printed
    timestamp is expressed in UTC and carries no offset suffix.

    Args:
        repo_dir: Repository directory; always used as ``cwd``.
        commit_hash: Commit to inspect.
        **kw: Extra keyword arguments forwarded to ``run_cap``. A supplied
            ``env`` mapping is copied before ``TZ`` is set, so the caller's
            object is left untouched; ``cwd`` is always overridden.

    Returns:
        The result of ``isoparse`` on the timestamp text printed by git.
    """
    # work on a copy: setting TZ directly on a caller-supplied mapping
    # (for instance os.environ) would leak the change back to the caller
    env = dict(kw.get('env') or {})
    env['TZ'] = 'UTC'
    kw['env'] = env
    kw['cwd'] = repo_dir

    proc_res = run_cap(['git', 'show', '-s', '--format=%cd',
                        '--date=format-local:%Y-%m-%dT%H:%M:%S',
                        commit_hash], **kw)
    return isoparse(proc_res.stdout.strip())