def _extract(self):
    """Load the bugged-files CSV of this version into ``self.data``.

    The CSV located by ``DataExtractor.get_bugged_files_path`` is read with
    ``;`` as separator.  The stored raw data maps every ``file_name`` to
    ``{"is_buggy": value}``, where ``value`` is the ``is_buggy`` entry of the
    first row listed for that file.

    Raises:
        KeyError: if the CSV lacks the ``file_name`` or ``is_buggy`` column.
    """
    extractor = DataExtractor(self.project)
    # NOTE(review): meaning of the second positional argument (True) is
    # defined by DataExtractor - confirm against its signature.
    path = extractor.get_bugged_files_path(self.version, True)
    df = pd.read_csv(path, sep=';')

    key = 'file_name'
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    missing = [column for column in (key, 'is_buggy')
               if column not in df.columns]
    if missing:
        raise KeyError(
            "columns {} not found in {}".format(missing, path))

    # Iterate the groups directly rather than going through
    # groupby(...).apply(...).to_dict(): the result no longer depends on how
    # apply() wraps dict return values, and a CSV without rows yields {}.
    # tolist() keeps the values as native Python scalars.
    bugged = {
        name: {"is_buggy": flags.tolist()[0]}
        for name, flags in df.groupby(key)["is_buggy"]
    }
    self.data.set_raw_data(bugged)
def _extract(self):
    """Build per-file process and issue features for ``self.version``.

    Steps:

    1. Look up the date of ``self.version`` in the cached versions CSV.
    2. Read the cached committed-files CSV and join its
       ``commit_id``/``issue_id`` columns onto the issue-dummies CSV
       (right join on ``issue_id``, using all commits).
    3. Keep only commits dated strictly before the version date that touch
       ``.java`` files listed in the version's bugged-files CSV.
    4. For every remaining file whose normalised, lower-cased path is known
       to ``self.file_analyser.relative_paths``, compute process features
       and issue features.
    5. Add ``ProcessData``, ``IssuesProductData`` and ``IssuesProcessData``
       to ``self.data``; both issue containers receive the same dict.

    Raises:
        IndexError: if ``self.version`` has no row in the versions CSV.
    """
    date_format = '%Y-%m-%d %H:%M:%S'
    config = Config().config
    repository_data = config["CACHING"]["RepositoryData"]
    github_name = self.project.github_name

    def cached_csv(section, suffix=""):
        # All cached tables share the layout
        # <RepositoryData>/<section dir>/<project>/<project><suffix>.csv
        return os.path.join(repository_data,
                            config['DATA_EXTRACTION'][section],
                            github_name,
                            github_name + suffix + ".csv")

    # get version_date from apache_versions
    versions_path = cached_csv("AllVersions")
    versions = pd.read_csv(versions_path, sep=';')
    matching_dates = versions.loc[
        versions['version_name'] == self.version, 'version_date']
    if matching_dates.empty:
        # Same exception type as the former `.to_list()[0]`, but with a
        # message that says what is actually wrong.
        raise IndexError("version {!r} not found in {}".format(
            self.version, versions_path))
    version_date = datetime.strptime(matching_dates.iloc[0], date_format)

    # get file list from committed_files
    commits = pd.read_csv(cached_csv("CommittedFiles"), sep=';')

    issues_df = pd.read_csv(cached_csv("Issues", "_dummies"), sep=';')
    # The join deliberately happens before the date filter below, so
    # issues_df refers to all commits of the project.
    issues_df = commits[['commit_id', 'issue_id']].merge(
        issues_df, on=['issue_id'], how='right')

    # keep only commits made strictly before the version date
    commit_dates = pd.to_datetime(commits['commit_date'], format=date_format)
    commits = commits[commit_dates < version_date]

    # restrict to .java files that belong to this version; a set makes the
    # membership test O(1) per row instead of scanning a list
    extractor = DataExtractor(self.project)
    bugged_path = extractor.get_bugged_files_path(self.version, True)
    version_files = set(pd.read_csv(bugged_path, sep=';')['file_name'])
    file_names = commits['file_name']
    is_version_java_file = (file_names.str.endswith('.java', na=False)
                            & file_names.isin(version_files))
    commits = commits[is_version_java_file]

    # split by file_name
    data = {}
    issues_data = {}
    for file_name, file_df in commits.groupby('file_name'):
        norm_name = os.path.normpath(file_name).lower()
        if norm_name not in self.file_analyser.relative_paths:
            continue
        name = self.file_analyser.relative_paths[norm_name]
        data[name] = self._extract_process_features(file_df)
        issues_data[name] = self._extract_issues_features(
            file_df, issues_df, self._get_blame_data(file_name))

    # register the extracted feature sets
    self.data.add(ProcessData(self.project, self.version, data=data)).add(
        IssuesProductData(self.project, self.version, data=issues_data)).add(
        IssuesProcessData(self.project, self.version, data=issues_data))
def _extract(self):
    """Load the bugged-files CSV of this version, keyed by analysed path.

    The CSV located by ``DataExtractor.get_bugged_files_path`` is read with
    ``;`` as separator.  Each ``file_name`` is lower-cased and normalised
    with ``os.path.normpath``; names that are not keys of
    ``self.file_analyser.relative_paths`` are skipped.  The stored raw data
    maps the corresponding ``relative_paths`` value to
    ``{"is_buggy": value}``, where ``value`` is the ``is_buggy`` entry of
    the first row listed for that file.

    Raises:
        KeyError: if the CSV lacks the ``file_name`` or ``is_buggy`` column.
    """
    extractor = DataExtractor(self.project)
    # NOTE(review): meaning of the second positional argument (True) is
    # defined by DataExtractor - confirm against its signature.
    path = extractor.get_bugged_files_path(self.version, True)
    df = pd.read_csv(path, sep=';')

    key = 'file_name'
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    missing = [column for column in (key, 'is_buggy')
               if column not in df.columns]
    if missing:
        raise KeyError(
            "columns {} not found in {}".format(missing, path))

    relative_paths = self.file_analyser.relative_paths
    ans = {}
    # Iterate the groups directly rather than going through
    # groupby(...).apply(...).to_dict(), so the result does not depend on
    # how apply() wraps dict return values.  Groups come in sorted key
    # order, so a later file still overrides an earlier one that maps to
    # the same relative path.
    for name, flags in df.groupby(key)["is_buggy"]:
        norm_name = os.path.normpath(name.lower())
        if norm_name in relative_paths:
            # tolist() keeps the value as a native Python scalar.
            ans[relative_paths[norm_name]] = {"is_buggy": flags.tolist()[0]}
    self.data.set_raw_data(ans)