Пример #1
0
 def _set_data(self):
     """Initialize self.data as an empty CompositeData container."""
     self.data = CompositeData()
Пример #2
0
 def _set_data(self):
     """Initialize self.data as an empty CompositeData container."""
     # NOTE(review): commented-out earlier variant kept for reference —
     # presumably replaced by the composite container below.
     # self.data = CheckstyleData(self.project, self.version)
     self.data = CompositeData()
Пример #3
0
class ProcessExtractor(Extractor):
    """Extracts process metrics (commit activity) and issue metrics for the
    bugged ``.java`` files of a given project version.

    Reads the cached versions / committed-files / issues CSVs, restricts the
    commits to those made before the version date, and aggregates per-file
    feature vectors via pandas ``describe``.  Results are published into
    ``self.data`` as a ``ProcessData`` + ``IssuesData`` pair.
    """

    # Columns that identify a commit row rather than describe it; dropped
    # before feature aggregation (shared by both feature extractors).
    _META_COLUMNS = ['file_name', 'is_java', 'commit_id', 'commit_date',
                     'commit_url', 'bug_url']

    def __init__(self, project: Project, version, repo=None):
        super().__init__(
            "ProcessExtractor", project, version,
            [DataType.ProcessFilesDataType, DataType.IssuesFilesDataType],
            repo)

    def _set_data(self):
        # Composite container: _extract() adds ProcessData and IssuesData.
        self.data = CompositeData()

    def clean(self, s):
        """Return *s* with every non-alphabetic character removed.

        Used to sanitize the parts of generated feature names.
        """
        return "".join(c for c in s if c.isalpha())

    def _extract(self):
        """Build per-file process and issue feature dicts and store them."""
        config = Config().config
        repository_data = config["CACHING"]["RepositoryData"]

        # get version_date from apache_versions
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["Versions"],
                            self.project.github(),
                            self.project.jira() + ".csv")
        versions_df = pd.read_csv(path, sep=';')
        version_date = versions_df[versions_df['version_name'] ==
                                   self.version]['version_date'].to_list()[0]
        version_date = datetime.strptime(version_date, '%Y-%m-%d %H:%M:%S')

        # get the commit/file table from committed_files
        path = os.path.join(repository_data,
                            config['DATA_EXTRACTION']["CommittedFiles"],
                            self.project.github(),
                            self.project.jira() + ".csv")
        df = pd.read_csv(path, sep=';')

        # join issue dummies onto the commit/issue link table
        issues_path = os.path.join(repository_data,
                                   config['DATA_EXTRACTION']["Issues"],
                                   self.project.github(),
                                   self.project.jira() + "_dummies.csv")
        issues_df = pd.read_csv(issues_path, sep=';')
        issues_df = df[['commit_id', 'issue_id']].merge(issues_df,
                                                        on=['issue_id'],
                                                        how='right')

        # keep only commits made strictly before the version date
        df = df[df['commit_date'].apply(
            lambda d: datetime.strptime(d, '%Y-%m-%d %H:%M:%S') < version_date)]

        # restrict to this version's bugged .java files; a set makes the
        # per-row membership test O(1) instead of O(len(files))
        extractor = DataExtractor(self.project)
        bugged_path = extractor.get_bugged_files_path(self.version, True)
        files = set(pd.read_csv(bugged_path, sep=';')['file_name'].to_list())
        df = df[df['file_name'].apply(
            lambda name: name.endswith('.java') and name in files)]

        # per-file feature extraction
        data = {}
        issues_data = {}
        for file_name, file_df in df.groupby('file_name', as_index=False):
            data[file_name] = self._extract_process_features(file_df)
            issues_data[file_name] = self._extract_issues_features(
                file_df, issues_df, self._get_blame_data(file_name))

        self.data.add(ProcessData(self.project, self.version, data=data)).add(
            IssuesData(self.project, self.version, data=issues_data))

    def _resolve_version_tag(self, repo):
        """Return the repo tag name matching ``self.version``.

        Tags may be cached with either path-separator style, so try both
        '\\\\'->'/' and '/'->'\\\\' replacements when no exact match exists.
        """
        version_names = [tag.name for tag in repo.tags]
        version = self.version
        if version not in version_names:
            if '\\' in version and \
                    version.replace('\\', '/') in version_names:
                version = version.replace('\\', '/')
            if '/' in version and \
                    version.replace('/', '\\') in version_names:
                version = version.replace('/', '\\')
        return version

    def _get_blame_data(self, file_name):
        """Blame *file_name* at this version: one DataFrame row per source
        line, holding the blaming commit id plus 'blame_'-prefixed metrics
        from the line's value vector."""
        repo = git.Repo(self.local_path)
        blame = repo.blame(self._resolve_version_tag(repo), file_name)
        # flatten [(commit, [line, ...]), ...] into [(commit, line), ...]
        flat = [(commit, line) for commit, chunk in blame for line in chunk]
        commits, source_code = zip(*flat)
        lines = CommentFilter().filterComments(source_code)[0]
        values = []
        for commit, line in zip(commits, lines):
            row = {"commit_id": commit.hexsha}
            for k, v in line.getValuesVector().items():
                row['blame_' + k] = v
            values.append(row)
        return pd.DataFrame(values)

    @staticmethod
    def _share_stats(hexshas, denominator):
        """Describe the distribution of per-commit line counts divided by
        *denominator* and return the describe() metrics as a dict."""
        counts = Counter(hexshas)
        shares = [n / denominator for n in counts.values()]
        return pd.DataFrame(shares,
                            columns=['col']).describe().to_dict()['col']

    def _get_blame_for_file(self, file_name):
        """Aggregate blame-based features for *file_name* at self.version.

        Returns a flat {feature_name: value} dict with blob/commit counts,
        commit-share statistics (raw and comment-filtered) and Halstead
        metrics of each commit's surviving lines.
        """
        ans = {}
        repo = git.Repo(self.local_path)
        blame = repo.blame(self.version, file_name)
        ans['blobs'] = len(blame)
        flat = [(commit, line) for commit, chunk in blame for line in chunk]
        commits, source_code = zip(*flat)
        ans['blame_commits'] = len(set(commits))
        stats = self._share_stats((c.hexsha for c in commits),
                                  ans['blame_commits'])
        for k, v in stats.items():
            ans['blame_' + k] = v

        # drop comment/blank lines; keep (commit, line) pairs for real code
        lines = CommentFilter().filterComments(source_code)[0]
        commits_lines = [(c, l) for c, l in zip(commits, lines) if l]
        # Bug fix: the original unpacked `set(zip(*commits_lines))` — a
        # two-element set with arbitrary iteration order — so
        # `filtered_commits` was randomly either the commit tuple or the
        # line tuple, and `blame_filtered_commits` was the size of that set
        # (always 2).  Unpack deterministically and count distinct commits,
        # mirroring the unfiltered block above.
        filtered_commits, _ = zip(*commits_lines)
        ans['blame_filtered_commits'] = len(set(filtered_commits))
        stats = self._share_stats((c.hexsha for c in filtered_commits),
                                  ans['blame_filtered_commits'])
        for k, v in stats.items():
            ans['blame_filtered_' + k] = v

        # Halstead metrics per distinct commit, then describe() across commits
        values = []
        for commit in set(filtered_commits):
            commit_code = [l for c, l in commits_lines if c == commit]
            values.append(Halstead(commit_code).getValuesVector())
        for col, metrics in pd.DataFrame(values).describe().to_dict().items():
            for k, v in metrics.items():
                ans['blame_halstead_' + col + "_" + k] = v
        return ans

    def _get_features(self, d, initial=''):
        """Flatten ``d.describe()`` into a {feature_name: value} dict whose
        names are prefixed with *initial*.

        Every (column, metric) pair is first defaulted to 0.0 so all feature
        vectors share the same key set; real non-NaN values then overwrite
        the defaults.  Bug fix: the defaults were keyed with the raw names
        while real values were keyed with clean()-ed names, so defaults were
        never overwritten and each feature appeared twice under different
        keys.  Both passes now use the cleaned names.
        """
        ans = {initial + "_count": d.shape[0]}
        # quartiles are dropped; only count/mean/std/min/max are kept
        des = d.describe().drop(['25%', '50%', '75%'])
        for col in des:
            for metric in des.index.to_list():
                ans["_".join([self.clean(initial),
                              self.clean(col),
                              self.clean(metric)])] = 0.0
        for col in des:
            for metric, value in des[col].to_dict().items():
                if value and not math.isnan(value):
                    ans["_".join([self.clean(initial),
                                  self.clean(col),
                                  self.clean(metric)])] = value
        return ans

    def _extract_process_features(self, df):
        """Process features over all of the file's commits (commit metadata
        and issue_id removed before aggregation)."""
        df = df.drop(self._META_COLUMNS, axis=1)
        ans = {}
        ans.update(
            self._get_features(df.drop('issue_id', axis=1), "all_process"))
        return ans

    def _extract_issues_features(self, df, issues_df, blame):
        """Issue-related features: blame rows joined to issues, fix vs
        non-fix commit splits, and the issue-dummies inner join."""
        ans = {}
        blame_merge = blame.merge(issues_df, on=['commit_id'], how='left')
        blame_merge = blame_merge.drop(['commit_id', 'issue_id'], axis=1)
        ans.update(self._get_features(blame_merge, "blame_merge"))

        df = df.drop(self._META_COLUMNS, axis=1)
        # issue_id == '0' marks commits not linked to any issue (non-fixes)
        ans.update(self._get_features(
            df[df['issue_id'] != '0'].drop('issue_id', axis=1), "fixes"))
        ans.update(self._get_features(
            df[df['issue_id'] == '0'].drop('issue_id', axis=1), "non_fixes"))

        merged = df.merge(issues_df.drop(['commit_id'], axis=1),
                          on=['issue_id'],
                          how='inner')
        merged = merged.drop(['key', 'issue_id'], axis=1)
        ans.update(self._get_features(merged, 'issues'))
        return ans
Пример #4
0
class Checkstyle(Extractor):
    """Runs the Checkstyle CLI over the project's checked-out sources and
    turns the "max allowed" violations of the all-checks ruleset into
    per-file and per-method metric dictionaries.
    """

    def __init__(self, project: Project, version, repo=None):
        super().__init__("Checkstyle", project, version, [
            DataType.CheckstyleFileDataType, DataType.CheckstyleMethodDataType
        ], repo)
        # Destination of the checkstyle XML report.
        self.out_path_to_xml = os.path.normpath(
            Config.get_work_dir_path(
                os.path.join(Config().config['CACHING']['RepositoryData'],
                             Config().config['TEMP']['Checkstyle'])))

    def _set_data(self):
        # Composite container: _extract() adds file- and method-level data.
        self.data = CompositeData()

    def _extract(self):
        """Run checkstyle, parse its report, and store both data sets."""
        all_checks_xml = self._get_all_checks_xml(self.config)
        self._execute_command(self.runner, all_checks_xml, self.local_path,
                              self.out_path_to_xml.replace("\\\\?\\", ""))
        checkstyle_files, checkstyle_methods = self._process_checkstyle_data(
            self.out_path_to_xml)
        self.data.add(CheckstyleFileData(self.project, self.version, data=checkstyle_files))\
            .add(CheckstyleMethodData(self.project, self.version, data=checkstyle_methods))

    @staticmethod
    def _get_all_checks_xml(config):
        """Return the absolute path of the all-checks ruleset XML configured
        under the EXTERNALS section."""
        externals = Config.get_work_dir_path(config['EXTERNALS']['BaseDir'])
        return os.path.join(externals, config['EXTERNALS']['AllChecks'])

    @staticmethod
    def _execute_command(checkstyle_runner: str, all_checks_xml: str,
                         local_path: str, out_path_to_xml: str) -> str:
        """Invoke the checkstyle jar on *local_path*, writing an XML report.

        The Windows long-path prefix is stripped from the output path because
        the java tool does not understand it.  Returns the report path.
        """
        commands = [
            "java", "-jar", checkstyle_runner, "-c", all_checks_xml, "-f",
            "xml", "-o",
            out_path_to_xml.replace("\\\\?\\", ""), local_path
        ]
        execute_timeout(commands)
        return out_path_to_xml

    def _process_checkstyle_data(self, out_path_to_xml):
        """Parse the checkstyle XML report.

        Returns ``(files, methods)`` where each maps an id to a metric dict.
        Every id gets the complete key set, defaulted to 0, so all vectors
        are uniform.
        """
        methods_keys = set()
        files_keys = set()
        methods_ = {}
        files_ = {}
        with open(out_path_to_xml, "r", encoding="utf-8") as file:
            root = ElementTree.parse(file).getroot()
            for file_element in root:
                # Bug fix: was a bare `except:` — only a missing 'name'
                # attribute is expected here, so catch exactly that.
                try:
                    filepath = file_element.attrib['name'].lower()
                except KeyError:
                    continue
                if not filepath.endswith(".java"):
                    continue
                files_, files_keys, methods_, methods_keys = self._get_items(
                    file_element, os.path.realpath(filepath), files_,
                    files_keys, methods_, methods_keys)
        checkstyle_methods = {}
        for method_id, metrics in methods_.items():
            checkstyle_methods[method_id] = dict.fromkeys(methods_keys, 0)
            checkstyle_methods[method_id].update(metrics)
        checkstyle_files = {}
        for file_id, metrics in files_.items():
            checkstyle_files[file_id] = dict.fromkeys(files_keys, 0)
            checkstyle_files[file_id].update(metrics)
        return checkstyle_files, checkstyle_methods

    def _get_items(self, file_element, file_path, files_, files_keys,
                   methods_, methods_keys):
        """Accumulate the "max allowed" violations of one ``<file>`` element
        into the method-level dicts (when a method encloses the line) or the
        file-level dicts otherwise.  Returns the four updated collections.
        """
        for error_element in file_element:
            line = int(error_element.attrib['line'])
            message = error_element.attrib['message']
            if "max allowed" not in message:
                continue
            # e.g. "File length is 2,500 lines (max allowed is 2,000)." ->
            # tokens ['File', 'length', 'is', '2500'] ->
            # key 'File_length', value 2500.
            # (The original parsed the message twice with duplicated
            # replace/split chains; parse it once.)
            tokens = (message.replace("lines", "")
                      .replace(",", "")
                      .split('(')[0]
                      .split())
            key = "_".join(tokens[:-2])
            value = int(tokens[-1])
            method_id = self.file_analyser.get_closest_id(file_path, line)
            if method_id:
                if "npath" in key.lower():
                    # NPath complexity explodes combinatorially; cap it so a
                    # single outlier does not dominate the feature.
                    value = min(10000, value)
                methods_.setdefault(method_id, dict())[key] = value
                methods_keys.add(key)
            else:
                files_.setdefault(file_path.lower(), dict())[key] = value
                files_keys.add(key)
        return files_, files_keys, methods_, methods_keys