def _set_data(self):
    """Initialise this extractor's data holder as an empty CompositeData."""
    self.data = CompositeData()
def _set_data(self):
    """Initialise this extractor's data holder as an empty CompositeData."""
    # An earlier revision used a single CheckstyleData container; kept for
    # reference until the composite approach is confirmed permanent:
    # self.data = CheckstyleData(self.project, self.version)
    self.data = CompositeData()
class ProcessExtractor(Extractor):
    """Extracts per-file process features (commit history statistics) and
    issue-related features for the bugged .java files of a project version.

    Reads cached CSVs produced by earlier pipeline stages (versions,
    committed files, issue dummies), keeps only commits dated before the
    version's release date, and stores the results as ProcessData and
    IssuesData inside a CompositeData container.
    """

    def __init__(self, project: Project, version, repo=None):
        super().__init__(
            "ProcessExtractor", project, version,
            [DataType.ProcessFilesDataType, DataType.IssuesFilesDataType],
            repo)

    def _set_data(self):
        # Two data types are produced (process + issues), hence a composite.
        self.data = CompositeData()

    def clean(self, s):
        """Return s with every non-alphabetic character removed.

        Used to normalise feature-name components before joining them
        into a key.
        """
        return "".join(list(filter(lambda c: c.isalpha(), s)))

    def _extract(self):
        """Build the process/issues feature dictionaries for every bugged
        .java file and add them to self.data."""
        # get version_date from apache_versions
        config = Config().config
        repository_data = config["CACHING"]["RepositoryData"]
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["Versions"],
                            self.project.github(),
                            self.project.jira() + ".csv")
        df = pd.read_csv(path, sep=';')
        version_date = df[df['version_name'] == self.version]['version_date'].to_list()[0]
        version_date = datetime.strptime(version_date, '%Y-%m-%d %H:%M:%S')
        # get file list from committed_files
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["CommittedFiles"],
                            self.project.github(),
                            self.project.jira() + ".csv")
        df = pd.read_csv(path, sep=';')
        issues_path = os.path.join(repository_data,
                                   config['DATA_EXTRACTION']["Issues"],
                                   self.project.github(),
                                   self.project.jira() + "_dummies.csv")
        issues_df = pd.read_csv(issues_path, sep=';')
        # Right-merge keeps every issue row and attaches the commit id(s)
        # that reference it.
        issues_df = df[['commit_id', 'issue_id']].merge(issues_df,
                                                        on=['issue_id'],
                                                        how='right')
        # filter commits after version date
        df = df[df.apply(lambda r: datetime.strptime(
            r['commit_date'], '%Y-%m-%d %H:%M:%S') < version_date, axis=1)]
        # split by file_name
        data = {}
        issues_data = {}
        extractor = DataExtractor(self.project)
        path = extractor.get_bugged_files_path(self.version, True)
        files = pd.read_csv(path, sep=';')['file_name'].to_list()
        # Only .java files that appear in the bugged-files list are kept.
        df = df[df.apply(lambda r: r['file_name'].endswith('.java') and
                         r['file_name'] in files, axis=1)]
        for file_name, file_df in df.groupby('file_name', as_index=False):
            data[file_name] = self._extract_process_features(file_df)
            issues_data[file_name] = self._extract_issues_features(
                file_df, issues_df, self._get_blame_data(file_name))
        # extract the following features:
        self.data.add(ProcessData(self.project, self.version, data=data)).add(
            IssuesData(self.project, self.version, data=issues_data))

    def _get_blame_data(self, file_name):
        """Return a DataFrame with one row per blamed source line of
        file_name at self.version: the blaming commit id plus the line's
        'blame_'-prefixed metric vector.
        """
        repo = git.Repo(self.local_path)
        version_names = list(map(lambda x: x.name, repo.tags))
        version = self.version
        # Tag names may use either path-separator style; try both variants
        # before blaming.
        if version not in version_names:
            if '\\' in version:
                if version.replace('\\', '/') in version_names:
                    version = version.replace('\\', '/')
            if '/' in version:
                if version.replace('/', '\\') in version_names:
                    version = version.replace('/', '\\')
        blame = repo.blame(version, file_name)
        # Flatten [(commit, [line, ...]), ...] to [(commit, line), ...].
        blame = reduce(
            list.__add__,
            map(lambda x: list(map(lambda y: (x[0], y), x[1])), blame), [])
        commits, source_code = list(zip(*blame))
        lines = CommentFilter().filterComments(source_code)[0]
        values = []
        for c, l in zip(commits, lines):
            d = {"commit_id": c.hexsha}
            for k, v in l.getValuesVector().items():
                d['blame_' + k] = v
            values.append(d)
        return pd.DataFrame(values)

    def _get_blame_for_file(self, file_name):
        """Return a dict of aggregate blame statistics for file_name at
        self.version (commit-share distribution, comment-filtered variants,
        and Halstead metrics per blaming commit).
        """
        ans = {}
        repo = git.Repo(self.local_path)
        blame = repo.blame(self.version, file_name)
        ans['blobs'] = len(blame)
        # Flatten [(commit, [line, ...]), ...] to [(commit, line), ...].
        blame = reduce(
            list.__add__,
            map(lambda x: list(map(lambda y: (x[0], y), x[1])), blame), [])
        commits, source_code = list(zip(*blame))
        ans['blame_commits'] = len(set(commits))
        # Per-commit share of blamed lines, then summary stats of the shares.
        c = dict(Counter(list(map(lambda x: x.hexsha, commits))))
        list(
            map(lambda x: c.update({x: c[x] / ans['blame_commits']}), c.keys()))
        for k, v in pd.DataFrame(
                c.values(), columns=['col']).describe().to_dict()['col'].items():
            ans['blame_' + k] = v
        lines = CommentFilter().filterComments(source_code)[0]
        # Keep only (commit, line) pairs whose line survives comment filtering.
        commits_lines = list(filter(lambda x: x[1], zip(commits, lines)))
        # NOTE(review): zip(*commits_lines) yields exactly two tuples
        # (commits, lines); passing them through set() makes the unpacking
        # order arbitrary, so filtered_commits may receive the LINES tuple.
        # Looks like a bug — unpacking zip(*commits_lines) directly was
        # presumably intended; confirm before relying on these features.
        filtered_commits, _ = set(zip(*commits_lines))
        c = dict(Counter(list(map(lambda x: x.hexsha, filtered_commits))))
        # NOTE(review): len(set(zip(*commits_lines))) counts distinct
        # top-level tuples (at most 2), not distinct commits — presumably
        # len(set(filtered_commits)) was intended; confirm.
        ans['blame_filtered_commits'] = len(set(zip(*commits_lines)))
        list(
            map(lambda x: c.update({x: c[x] / ans['blame_filtered_commits']}),
                c.keys()))
        for k, v in pd.DataFrame(
                c.values(), columns=['col']).describe().to_dict()['col'].items():
            ans['blame_filtered_' + k] = v
        values = []
        # Halstead metrics computed over each commit's surviving lines
        # (note: this loop rebinds c, previously the share dict).
        for c in set(filtered_commits):
            l = list(
                map(lambda l: l[1], filter(lambda l: l[0] == c, commits_lines)))
            h = Halstead(l).getValuesVector()
            values.append(h)
        for col, d in pd.DataFrame(values).describe().to_dict().items():
            for k, v in d.items():
                ans['blame_halstead_' + col + "_" + k] = v
        return ans

    def _get_features(self, d, initial=''):
        """Summarise DataFrame d into a flat feature dict keyed
        '<initial>_<column>_<statistic>', dropping the quartile rows of
        describe() and zero-defaulting every key."""
        ans = {initial + "_count": d.shape[0]}
        des = d.describe()
        des = des.drop(['25%', '50%', '75%'])
        for col in des:
            for metric in des.index.to_list():
                # set default value
                # NOTE(review): defaults use the raw names while the real
                # values below use clean()-ed names, so the two key sets
                # diverge whenever a name contains non-alpha characters
                # (e.g. "all_process" vs "allprocess") — confirm intended.
                ans["_".join([initial, col, metric])] = 0.0
        for col in des:
            for k, v in des[col].to_dict().items():
                if v and not math.isnan(v):
                    ans["_".join(
                        [self.clean(initial), self.clean(col), self.clean(k)])] = v
        return ans

    def _extract_process_features(self, df):
        """Return summary features over all process columns of df
        (identifier/URL columns removed)."""
        df = df.drop([
            'file_name', 'is_java', 'commit_id', 'commit_date', 'commit_url',
            'bug_url'
        ], axis=1)
        ans = {}
        ans.update(
            self._get_features(df.drop('issue_id', axis=1), "all_process"))
        return ans

    def _extract_issues_features(self, df, issues_df, blame):
        """Return features split by issue linkage: blame rows merged with
        issues, fix vs non-fix commits (issue_id != '0' vs == '0'), and the
        commit/issue inner merge."""
        ans = {}
        # d = df[['commit_id', 'issue_id']]
        # blame_merge = d.merge(blame, on=['commit_id'], how='right')
        blame_merge = blame.merge(issues_df, on=['commit_id'], how='left')
        blame_merge = blame_merge.drop(['commit_id', 'issue_id'], axis=1)
        ans.update(self._get_features(blame_merge, "blame_merge"))
        df = df.drop([
            'file_name', 'is_java', 'commit_id', 'commit_date', 'commit_url',
            'bug_url'
        ], axis=1)
        ans.update(
            self._get_features(
                df[df['issue_id'] != '0'].drop('issue_id', axis=1), "fixes"))
        ans.update(
            self._get_features(
                df[df['issue_id'] == '0'].drop('issue_id', axis=1), "non_fixes"))
        merged = df.merge(issues_df.drop(['commit_id'], axis=1),
                          on=['issue_id'], how='inner')
        merged = merged.drop(['key', 'issue_id'], axis=1)
        ans.update(self._get_features(merged, 'issues'))
        # for dummy in dummies_dict:
        #     # percent
        #     for d in dummies_dict[dummy]:
        #         ans.update(self._get_features(merged[merged[d] == 1].drop(d, axis=1), d))
        #         ans.update(self._get_features(blame_merge[blame_merge[d] == 1], d))
        return ans
class Checkstyle(Extractor):
    """Runs the Checkstyle CLI over the checked-out sources and turns its
    XML report into per-file and per-method metric dictionaries, stored as
    CheckstyleFileData / CheckstyleMethodData inside a CompositeData.
    """

    def __init__(self, project: Project, version, repo=None):
        super().__init__("Checkstyle", project, version, [
            DataType.CheckstyleFileDataType, DataType.CheckstyleMethodDataType
        ], repo)
        # Destination of the XML report produced by the Checkstyle run.
        self.out_path_to_xml = os.path.normpath(
            Config.get_work_dir_path(
                os.path.join(Config().config['CACHING']['RepositoryData'],
                             Config().config['TEMP']['Checkstyle'])))

    def _set_data(self):
        # Two data types are produced (file + method), hence a composite.
        # self.data = CheckstyleData(self.project, self.version)
        self.data = CompositeData()

    def _extract(self):
        """Run Checkstyle, parse its report, and add the resulting file and
        method data to self.data."""
        all_checks_xml = self._get_all_checks_xml(self.config)
        # Strip the Windows extended-length path prefix (\\?\) before handing
        # the path to the external java process.
        self._execute_command(self.runner, all_checks_xml, self.local_path,
                              self.out_path_to_xml.replace("\\\\?\\", ""))
        checkstyle_files, checkstyle_methods = self._process_checkstyle_data(
            self.out_path_to_xml)
        self.data.add(CheckstyleFileData(self.project, self.version, data=checkstyle_files))\
            .add(CheckstyleMethodData(self.project, self.version, data=checkstyle_methods))

    @staticmethod
    def _get_all_checks_xml(config):
        """Return the path to the all-checks Checkstyle configuration XML
        under the externals directory."""
        externals_path = config['EXTERNALS']['BaseDir']
        all_checks_xml_name = config['EXTERNALS']['AllChecks']
        externals = Config.get_work_dir_path(externals_path)
        all_checks_xml = os.path.join(externals, all_checks_xml_name)
        return all_checks_xml

    @staticmethod
    def _execute_command(checkstyle_runner: str, all_checks_xml: str,
                         local_path: str, out_path_to_xml: str) -> str:
        """Invoke the Checkstyle jar on local_path, writing an XML report to
        out_path_to_xml, and return that path."""
        commands = [
            "java", "-jar", checkstyle_runner, "-c", all_checks_xml, "-f",
            "xml", "-o", out_path_to_xml.replace("\\\\?\\", ""), local_path
        ]
        execute_timeout(commands)
        return out_path_to_xml

    def _process_checkstyle_data(self, out_path_to_xml):
        """Parse the Checkstyle XML report and return
        (files_dict, methods_dict), where every id maps to a metric dict
        zero-defaulted over all metric keys seen in the report."""
        checkstyle_methods = {}
        checkstyle_files = {}
        methods_keys = set()
        files_keys = set()
        methods_ = {}
        files_ = {}
        with open(out_path_to_xml, "r", encoding="utf-8") as file:
            root = ElementTree.parse(file).getroot()
            for file_element in root:
                try:
                    filepath = file_element.attrib['name'].lower()
                except KeyError:
                    # Fixed: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. Only a missing 'name'
                    # attribute (KeyError) is expected here; skip such nodes.
                    continue
                if not filepath.endswith(".java"):
                    continue
                files_, files_keys, methods_, methods_keys = self._get_items(
                    file_element, os.path.realpath(filepath), files_,
                    files_keys, methods_, methods_keys)
        # Zero-fill every id over the union of keys so all dicts share the
        # same schema, then overlay the observed values.
        for method_id in methods_:
            checkstyle_methods[method_id] = dict.fromkeys(methods_keys, 0)
            checkstyle_methods[method_id].update(methods_[method_id])
        for file_id in files_:
            checkstyle_files[file_id] = dict.fromkeys(files_keys, 0)
            checkstyle_files[file_id].update(files_[file_id])
        return checkstyle_files, checkstyle_methods

    def _get_items(self, file_element, file_path, files_, files_keys,
                   methods_, methods_keys):
        """Accumulate metric values from one <file> report element into the
        method-level dicts (when a method encloses the reported line) or the
        file-level dicts otherwise, and return the four accumulators.

        Only "max allowed" messages are used; the metric name is taken from
        the message words before the value, the value from the word right
        before the '(' (assumes messages shaped like
        "<Metric name> is <N> ... (max allowed ...)" — TODO confirm against
        the Checkstyle version in use).
        """
        for errorElement in file_element:
            line = int(errorElement.attrib['line'])
            if "max allowed" not in errorElement.attrib['message']:
                continue
            key = "_".join(errorElement.attrib['message'] \
                .replace("lines", "") \
                .replace(",", "") \
                .split('(')[0] \
                .split()[:-2])
            value = int(errorElement.attrib['message'] \
                .replace("lines", "") \
                .replace(",", "") \
                .split('(')[0] \
                .split()[-1] \
                .strip())
            method_id = self.file_analyser.get_closest_id(file_path, line)
            if method_id:
                # NPath complexity can explode; cap it to keep values sane.
                if "npath" in key.lower():
                    value = min(10000, int(value))
                methods_.setdefault(method_id, dict())[key] = value
                methods_keys.add(key)
            else:
                files_.setdefault(file_path.lower(), dict())[key] = value
                files_keys.add(key)
        return files_, files_keys, methods_, methods_keys