Example #1
    def _extract(self):
        # read the bugged-files CSV produced by the DataExtractor for this version
        extractor = DataExtractor(self.project)
        path = extractor.get_bugged_files_path(self.version, True)
        df = pd.read_csv(path, sep=';')
        key = 'file_name'
        assert key in df.columns
        # build {file_name: {'is_buggy': value}} and store it as the raw data
        bugged = df.groupby(key).apply(
            lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
        self.data.set_raw_data(bugged)
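
Below is a minimal standalone sketch of the groupby pattern used above. The DataFrame contents are invented for illustration; only the column names mirror the example.

import pandas as pd

# toy data: same column names as the example, made-up values
df = pd.DataFrame({
    'file_name': ['A.java', 'B.java'],
    'is_buggy': [1, 0],
})

# pair the single "is_buggy" label with each file's value,
# yielding {'A.java': {'is_buggy': 1}, 'B.java': {'is_buggy': 0}}
bugged = df.groupby('file_name').apply(
    lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
print(bugged)
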
Example #2
    def _extract(self):
        # get version_date from apache_versions
        config = Config().config
        repository_data = config["CACHING"]["RepositoryData"]
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["AllVersions"],
                            self.project.github_name,
                            self.project.github_name + ".csv")
        df = pd.read_csv(path, sep=';')
        version_date = df[df['version_name'] ==
                          self.version]['version_date'].to_list()[0]
        version_date = datetime.strptime(version_date, '%Y-%m-%d %H:%M:%S')
        # get file list from committed_files
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["CommittedFiles"],
                            self.project.github_name,
                            self.project.github_name + ".csv")
        df = pd.read_csv(path, sep=';')
        issues_path = os.path.join(repository_data,
                                   config['DATA_EXTRACTION']["Issues"],
                                   self.project.github_name,
                                   self.project.github_name + "_dummies.csv")
        issues_df = pd.read_csv(issues_path, sep=';')
        issues_df = df[['commit_id', 'issue_id']].merge(issues_df,
                                                        on=['issue_id'],
                                                        how='right')
        # keep only commits made before the version date
        df = df[df.apply(
            lambda r: datetime.strptime(r['commit_date'],
                                        '%Y-%m-%d %H:%M:%S') < version_date,
            axis=1)]
        # split by file_name
        data = {}
        issues_data = {}

        extractor = DataExtractor(self.project)
        path = extractor.get_bugged_files_path(self.version, True)
        files = pd.read_csv(path, sep=';')['file_name'].to_list()
        # keep only .java files that appear in the bugged-files list
        df = df[df.apply(
            lambda r: r['file_name'].endswith('.java') and r['file_name'] in files,
            axis=1)]

        for file_name, file_df in df.groupby('file_name', as_index=False):
            norm_name = os.path.normpath(file_name).lower()
            if norm_name not in self.file_analyser.relative_paths:
                continue
            name = self.file_analyser.relative_paths[norm_name]
            data[name] = self._extract_process_features(file_df)
            issues_data[name] = self._extract_issues_features(
                file_df, issues_df, self._get_blame_data(file_name))
        # store the extracted process and issues features for this version
        self.data.add(
            ProcessData(self.project, self.version, data=data)
        ).add(
            IssuesProductData(self.project, self.version, data=issues_data)
        ).add(
            IssuesProcessData(self.project, self.version, data=issues_data)
        )
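
Below is a minimal standalone sketch of the commit-date filter used above. The rows and the version date are invented for illustration; only the column names and the timestamp format mirror the example.

from datetime import datetime

import pandas as pd

# toy data in the same format as the example
df = pd.DataFrame({
    'file_name': ['A.java', 'B.java'],
    'commit_date': ['2020-01-01 10:00:00', '2021-06-01 12:00:00'],
})
version_date = datetime.strptime('2020-12-31 00:00:00', '%Y-%m-%d %H:%M:%S')

# keep only commits made before the version date
df = df[df.apply(
    lambda r: datetime.strptime(r['commit_date'],
                                '%Y-%m-%d %H:%M:%S') < version_date,
    axis=1)]
print(df)  # only the 2020 commit remains
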
Example #3
    def _extract(self):
        # read the bugged-files CSV produced by the DataExtractor for this version
        extractor = DataExtractor(self.project)
        path = extractor.get_bugged_files_path(self.version, True)
        df = pd.read_csv(path, sep=';')
        key = 'file_name'
        assert key in df.columns
        # build {file_name: {'is_buggy': value}} per file
        bugged = df.groupby(key).apply(
            lambda x: dict(zip(["is_buggy"], x.is_buggy))).to_dict()
        # keep only files known to the file analyser, keyed by their relative path
        ans = dict()
        for name, value in bugged.items():
            norm_name = os.path.normpath(name.lower())
            if norm_name in self.file_analyser.relative_paths:
                ans[self.file_analyser.relative_paths[norm_name]] = value
        self.data.set_raw_data(ans)
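
Below is a minimal standalone sketch of the path-normalization lookup at the end of this example. The relative_paths dict is a made-up stand-in for self.file_analyser.relative_paths, and the file names are invented.

import os

# stand-in for self.file_analyser.relative_paths: normalized, lower-cased
# names mapped to the paths the analyser knows about
relative_paths = {os.path.normpath('src/main/a.java'): 'src/main/A.java'}
bugged = {
    'src/main/A.java': {'is_buggy': 1},
    'other/B.java': {'is_buggy': 0},
}

ans = dict()
for name, value in bugged.items():
    norm_name = os.path.normpath(name.lower())
    if norm_name in relative_paths:
        # translate the raw file name to the analyser's relative path
        ans[relative_paths[norm_name]] = value
print(ans)  # {'src/main/A.java': {'is_buggy': 1}}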