예제 #1
0
    def update(self):
        """
        updates the Git data stored in the DB

        Sets up the logger and its file handler, creates the Git querier and
        the DAO, looks up the repository id and hands it, together with the
        import type returned by _get_import_type, to _update_info_code.
        The elapsed time is logged on success. Exceptions are logged as
        "Code2DbUpdate failed" instead of being raised, and the DAO
        connection (if one was opened) is closed in every case.
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Code2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            # the returned project id is not needed here; the lookup itself is
            # kept so the DAO is queried exactly as before
            self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_info_code(repo_id, self._get_import_type(repo_id))

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Code2DbUpdate finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            # "except Exception" rather than a bare "except" so that
            # KeyboardInterrupt and SystemExit are not swallowed
            self._logger.error("Code2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
예제 #2
0
    def extract(self):
        """
        Import the Git data of the configured repository into the DB.

        Exceptions are logged as "Git2DbMain failed" instead of being raised,
        and the DAO connection (if one was opened) is closed whether or not
        the import succeeded.
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, self._log_path, "info")

            self._logger.info("Git2DbMain started")
            began_at = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            # register the repository under its project, then read back its id
            project = self._dao.select_project_id(self._project_name)
            self._dao.insert_repo(project, self._repo_name)
            repository = self._dao.select_repo_id(self._repo_name)

            # the contribution import is noted as not needing the DB
            # connection; the connection is restarted once it has finished
            self._get_info_contribution(repository)
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repository)

            finished_at = datetime.now()
            elapsed = self._logging_util.calculate_execution_time(
                finished_at, began_at)
            summary = "".join([
                "Git2DbMain finished after ",
                str(elapsed[0]),
                " minutes and ",
                str(round(elapsed[1], 1)),
                " secs",
            ])
            self._logger.info(summary)
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
예제 #3
0
    def __call__(self):
        """
        Run the import for the configured reference.

        Builds the log path from the log root and the printable form of the
        reference name, attaches the file handler, creates the Git querier
        and the DAO and then delegates to extract(). Exceptions are logged
        as "Git2Db failed"; the DAO connection, if any, is always closed.
        """
        try:
            log_prefix = self._log_root_path + "-git2db-"
            reference_log = log_prefix + self._make_it_printable(
                self._ref_name)

            self._logger = self._logging_util.get_logger(reference_log)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, reference_log, "info")

            # both collaborators report through the logger created above
            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            self.extract()
        except Exception:
            self._logger.error("Git2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
    def __call__(self):
        """
        Run the code import for the configured interval of commit/file pairs.

        The log path is derived from the log root plus the commit and file
        ids of the first and last pair of the interval, so the interval is
        expected to be non-empty. Exceptions raised while creating the
        queriers or the DAO, or by extract(), are logged as
        "Code2DbTag failed"; the DAO connection, if any, is always closed.
        """
        self._logging_util = LoggingUtil()

        log_prefix = self._log_root_path + "-code2db-"
        first_pair = self._interval[0]
        last_pair = self._interval[-1]
        first_tag = str(first_pair.get('commit_id')) + "_" + str(
            first_pair.get('file_id'))
        last_tag = str(last_pair.get('commit_id')) + "_" + str(
            last_pair.get('file_id'))
        log_path = log_prefix + first_tag + "-" + last_tag

        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            # temporary files share the log path as their common prefix
            self._tmp_root_file = log_path + "-tmp."
            self._git_querier = GitQuerier(self._git_repo_path, self._logger)
            code_tmp_file = self._tmp_root_file + "txt"
            self._code_querier = CodeQuerier(self._logger, code_tmp_file)
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Code2DbTag failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
예제 #5
0
class Git2DbReference(object):
    """
    This class handles the import of Git references

    An instance is callable: calling it sets up logging, creates the Git
    querier and the DAO and runs extract() for the configured reference.
    """

    #do not import patches
    LIGHT_IMPORT_TYPE = 1
    #import patches but not at line level
    MEDIUM_IMPORT_TYPE = 2
    #import patches also at line level
    FULL_IMPORT_TYPE = 3

    def __init__(self, db_name, repo_id, git_repo_path, before_date,
                 import_type, ref_name, ref_type, from_sha, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type repo_id: int
        :param repo_id: the id of an existing repository in the DB

        :type git_repo_path: str
        :param git_repo_path: local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type import_type: int
        :param import_type: 1 does not import patches, 2 imports patches but not at line level, 3 imports patches with line detail

        :type ref_name: str
        :param ref_name: the name of the reference to import

        :param ref_type: the type of the reference to import (stored and
            passed on; this class does not otherwise inspect it)

        :type from_sha: str
        :param from_sha: the SHA of the commit from where to start the import

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._git_repo_path = git_repo_path
        self._repo_id = repo_id
        self._db_name = db_name
        self._ref_name = ref_name
        self._ref_type = ref_type
        self._before_date = before_date
        self._import_type = import_type
        self._from_sha = from_sha
        self._config = config
        self._logging_util = LoggingUtil()
        #the attributes below are created when the instance is called
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        """
        sets up the logger, the Git querier and the DAO, then runs extract()

        Exceptions are logged as "Git2Db failed"; the DAO connection, if one
        was opened, is closed in every case.
        """
        try:
            log_path = self._log_root_path + "-git2db-" + self._make_it_printable(
                self._ref_name)
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(
                self._logger, log_path, "info")

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Git2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _make_it_printable(self, str):
        #decodes the byte string as UTF-8 (ignoring undecodable bytes),
        #lowercases it and replaces every run of non-word or whitespace
        #characters with a single '-'
        #NOTE: the parameter name shadows the builtin str; it is kept so that
        #existing callers are unaffected
        u = str.decode('utf-8', 'ignore').lower()
        return re.sub(r'(\W|\s)+', '-', u)

    def _get_info_contribution_in_reference(self, reference_name,
                                            reference_type, repo_id, from_sha):
        #collects the commits of a reference, optionally restricted to those
        #after from_sha and/or before the configured date, and analyses them
        #(reference_type is accepted but not used here)
        if from_sha:
            if self._before_date:
                commits = self._querier.collect_all_commits_after_sha_before_date(
                    reference_name, from_sha, self._before_date)
            else:
                commits = self._querier.collect_all_commits_after_sha(
                    reference_name, from_sha)
        elif self._before_date:
            commits = self._querier.collect_all_commits_before_date(
                reference_name, self._before_date)
        else:
            commits = self._querier.collect_all_commits(reference_name)

        self._analyse_commits(commits, reference_name, repo_id)

    def _load_all_references(self, repo_id):
        # load all git branches and tags into database
        for reference in self._querier.get_references():
            ref_name = reference[0]
            ref_type = reference[1]
            #inserts reference to DB
            self._dao.insert_reference(repo_id, ref_name, ref_type)

    def _get_diffs_from_commit(self, commit, files_in_commit):
        #calculates diffs within files in a commit; the last argument passed
        #to get_diffs is True for every import type above the light one
        retrieve_patch = self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE
        return self._querier.get_diffs(commit, files_in_commit, retrieve_patch)

    def _import_line_details(self, commit_id, file_id, patch_content, ext):
        #stores the line details of the modification of a file in a commit
        file_modification_id = self._dao.select_file_modification_id(
            commit_id, file_id)
        line_details = self._querier.get_line_details(patch_content, ext)
        for line_detail in line_details:
            self._dao.insert_line_details(file_modification_id, line_detail)

    def _import_files_of_root_commit(self, commit, commit_id, repo_id,
                                     commit_stats_files):
        #stores the files and file modifications of a commit without parents
        for diff in self._querier.get_diffs_no_parent_commit(commit):
            file_path = diff[0]
            ext = self._querier.get_ext(file_path)

            self._dao.insert_file(repo_id, file_path, ext)
            file_id = self._dao.select_file_id(repo_id, file_path)

            if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
                #strips the text that precedes a '@@' hunk marker
                #NOTE(review): the pattern is greedy, so with several hunks
                #everything before the LAST '\n@@' is dropped - confirm that
                #this is intended
                patch_content = re.sub(r'^(\w|\W)*\n@@', '@@', diff[1])
            else:
                patch_content = None

            stats = self._querier.get_stats_for_file(commit_stats_files,
                                                     file_path)
            status = self._querier.get_status_with_diff(stats, diff)

            #insert file modification
            self._dao.insert_file_modification(commit_id, file_id, status,
                                               stats[0], stats[1], stats[2],
                                               patch_content)

            if self._import_type == Git2DbReference.FULL_IMPORT_TYPE:
                self._import_line_details(commit_id, file_id, patch_content,
                                          ext)

    def _import_renamed_file(self, diff, commit_id, repo_id, sha):
        #stores a rename: the new file, its modification and the link to the
        #previous file
        file_previous = self._querier.get_rename_from(diff)
        ext_previous = self._querier.get_ext(file_previous)

        file_current = self._querier.get_file_current(diff)
        ext_current = self._querier.get_ext(file_current)

        #insert new file
        self._dao.insert_file(repo_id, file_current, ext_current)

        #get id new file
        current_file_id = self._dao.select_file_id(repo_id, file_current)

        #retrieve the id of the previous file
        previous_file_id = self._dao.select_file_id(repo_id, file_previous)

        #insert file modification
        self._dao.insert_file_modification(commit_id, current_file_id,
                                           "renamed", 0, 0, 0, None)

        #the previous file is inserted when it is not in the DB yet
        if not previous_file_id:
            self._dao.insert_file(repo_id, file_previous, ext_previous)
            previous_file_id = self._dao.select_file_id(
                repo_id, file_previous)

        if current_file_id == previous_file_id:
            self._logger.warning(
                "previous file id is equal to current file id (" +
                str(current_file_id) + ") " + str(sha))
        else:
            file_modification_id = self._dao.select_file_modification_id(
                commit_id, current_file_id)
            self._dao.insert_file_renamed(repo_id, current_file_id,
                                          previous_file_id,
                                          file_modification_id)

    def _import_changed_file(self, diff, commit_id, repo_id,
                             commit_stats_files):
        #stores a non-renamed file together with its modification
        file_path = self._querier.get_file_path(diff)

        ext = self._querier.get_ext(file_path)

        stats = self._querier.get_stats_for_file(commit_stats_files,
                                                 file_path)
        status = self._querier.get_status_with_diff(stats, diff)

        #if the file is new, add it
        if self._querier.is_new_file(diff):
            self._dao.insert_file(repo_id, file_path, ext)
        file_id = self._dao.select_file_id(repo_id, file_path)

        #a file that is still unknown to the DB is inserted as well
        if not file_id:
            self._dao.insert_file(repo_id, file_path, ext)
            file_id = self._dao.select_file_id(repo_id, file_path)

        if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE:
            patch_content = self._querier.get_patch_content(diff)
        else:
            patch_content = None

        #insert file modification (additions, deletions)
        self._dao.insert_file_modification(commit_id, file_id, status,
                                           stats[0], stats[1], stats[2],
                                           patch_content)

        if self._import_type == Git2DbReference.FULL_IMPORT_TYPE:
            self._import_line_details(commit_id, file_id, patch_content, ext)

    def _import_files_of_commit(self, commit, commit_id, repo_id, sha,
                                commit_stats_files):
        #stores the files and file modifications of a commit with parents
        for diff in self._get_diffs_from_commit(commit,
                                                commit_stats_files.keys()):
            if self._querier.is_renamed(diff):
                self._import_renamed_file(diff, commit_id, repo_id, sha)
            else:
                #insert file
                #if the file does not have a path, it won't be inserted
                try:
                    self._import_changed_file(diff, commit_id, repo_id,
                                              commit_stats_files)
                except Exception:
                    #a failing file is logged and the next diff is processed
                    self._logger.error("Something went wrong with commit " +
                                       str(sha),
                                       exc_info=True)

    def _analyse_commit(self, commit, repo_id, ref_id):
        #analyses a commit: stores its users, the commit itself with its
        #files (only when the commit is not in the DB yet), its parents and
        #its membership in the reference; failures are logged, not raised

        #bound before the try so that the error handler below can always
        #report it, even when reading the commit properties fails
        sha = None
        try:
            message = self._querier.get_commit_property(commit, "message")
            author_name = self._querier.get_commit_property(
                commit, "author.name")
            author_email = self._querier.get_commit_property(
                commit, "author.email")
            committer_name = self._querier.get_commit_property(
                commit, "committer.name")
            committer_email = self._querier.get_commit_property(
                commit, "committer.email")
            size = self._querier.get_commit_property(commit, "size")
            sha = self._querier.get_commit_property(commit, "hexsha")
            authored_date = self._querier.get_commit_time(
                self._querier.get_commit_property(commit, "authored_date"))
            committed_date = self._querier.get_commit_time(
                self._querier.get_commit_property(commit, "committed_date"))

            if author_name is None and author_email is None:
                self._logger.warning(
                    "author name and email are null for commit: " + sha)

            if committer_name is None and committer_email is None:
                self._logger.warning(
                    "committer name and email are null for commit: " + sha)

            #insert author
            author_id = self._dao.get_user_id(author_name, author_email)
            committer_id = self._dao.get_user_id(committer_name,
                                                 committer_email)

            commit_found = self._dao.select_commit_id(sha, repo_id)

            if not commit_found:
                #insert commit
                self._dao.insert_commit(repo_id, sha, message, author_id,
                                        committer_id, authored_date,
                                        committed_date, size)
                commit_found = self._dao.select_commit_id(sha, repo_id)

                commit_stats_files = commit.stats.files
                try:
                    if self._querier.commit_has_no_parents(commit):
                        self._import_files_of_root_commit(
                            commit, commit_found, repo_id, commit_stats_files)
                    else:
                        self._import_files_of_commit(commit, commit_found,
                                                     repo_id, sha,
                                                     commit_stats_files)
                except Exception:
                    #a failure while importing the files does not prevent the
                    #parents and the reference link from being stored below
                    self._logger.error("Git2Db failed on commit " + str(sha),
                                       exc_info=True)

            # insert parents of the commit
            self._dao.insert_commit_parents(commit.parents, commit_found, sha,
                                            repo_id)
            # insert commits in reference
            self._dao.insert_commit_in_reference(repo_id, commit_found, ref_id)
        except Exception:
            self._logger.error("Git2Db failed on commit " + str(sha),
                               exc_info=True)

    def _analyse_commits(self, commits, ref, repo_id):
        #analyses commits in references
        ref_id = self._dao.select_reference_id(repo_id, ref)
        for c in commits:
            self._analyse_commit(c, repo_id, ref_id)

    def extract(self):
        """
        extracts Git data and stores it in the DB

        Loads all references of the repository, imports the commits of the
        configured reference and logs the elapsed time. Exceptions are logged
        as "Git2DbReference failed" instead of being raised.
        """
        try:
            self._logger.info("Git2DbReference started")
            start_time = datetime.now()
            self._load_all_references(self._repo_id)
            self._get_info_contribution_in_reference(self._ref_name,
                                                     self._ref_type,
                                                     self._repo_id,
                                                     self._from_sha)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Git2DbReference finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbReference failed", exc_info=True)
class Code2DbCommitFile():
    """
    This class handles the import of code function data for a set of commit file pairs

    An instance is callable: calling it sets up logging, creates the Git
    and code queriers and the DAO, and runs extract() for the interval.
    """

    #import overall function statistics per file
    LIGHT_IMPORT_TYPE = 1
    #import function-level information
    FULL_IMPORT_TYPE = 2

    #number of distinct temporary files after which they are deleted
    _TMP_FILES_LIMIT = 20

    def __init__(self, db_name, git_repo_path, interval, import_type, config,
                 log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type git_repo_path: str
        :param git_repo_path: local path of the Git repository

        :type interval: list dict
        :param interval: a list of commit file pair

        :type import_type: int
        :param import_type: 1 = import overall function statistics per file, 2 = import function-level information

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._git_repo_path = git_repo_path
        self._db_name = db_name
        self._interval = interval
        self._import_type = import_type
        self._config = config
        #the attributes below are created when the instance is called
        self._logging_util = None
        self._fileHandler = None
        self._logger = None
        self._git_querier = None
        self._code_querier = None
        self._dao = None
        self._tmp_root_file = None

    def __call__(self):
        """
        sets up the logger, the queriers and the DAO, then runs extract()

        The log path is built from the commit and file ids of the first and
        the last pair of the interval, so the interval must not be empty.
        Exceptions raised after the logger is set up are logged as
        "Code2DbTag failed"; the DAO connection, if one was opened, is
        closed in every case.
        """
        self._logging_util = LoggingUtil()
        log_path = self._log_root_path + "-code2db-" + str(
            self._interval[0].get('commit_id')) + "_" + str(
                self._interval[0].get('file_id')) + "-" + str(
                    self._interval[-1].get('commit_id')) + "_" + str(
                        self._interval[-1].get('file_id'))
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(
            self._logger, log_path, "info")

        try:
            #temporary files share the log path as their common prefix
            self._tmp_root_file = log_path + "-tmp."
            self._git_querier = GitQuerier(self._git_repo_path, self._logger)
            self._code_querier = CodeQuerier(self._logger,
                                             self._tmp_root_file + "txt")
            self._dao = GitDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("Code2DbTag failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _save_content(self, content, target):
        #writes content to target as UTF-8; the with-block closes the file
        #even when the write fails
        with codecs.open(target, "w+", "utf-8") as tmp_file:
            tmp_file.write(content)

    def _delete_tmp_files(self, targets):
        #removes every target that exists on disk; missing ones are skipped
        for target in targets:
            if os.path.exists(target):
                os.remove(target)

    def _import_complexity_info(self, commit_id, file_id, tmp_file):
        #stores the complexity information of a file at a commit and, for the
        #full import type, the functions found in it
        file_info, fun_info = self._code_querier.get_complexity_info(
            tmp_file, self._import_type)
        self._dao.insert_code_at_commit(
            commit_id, file_id, file_info.get('ccn'), file_info.get('loc'),
            file_info.get('comments'), file_info.get('blanks'),
            file_info.get('funs'), file_info.get('tokens'),
            file_info.get('avg_ccn'), file_info.get('avg_loc'),
            file_info.get('avg_tokens'))

        if self._import_type == Code2DbCommitFile.FULL_IMPORT_TYPE:
            for fi in fun_info:
                self._dao.insert_function(
                    fi.get('name'), file_id, fi.get('args'), fi.get('loc'),
                    fi.get('tokens'), fi.get('lines'), fi.get('ccn'),
                    fi.get('start'), fi.get('end'))

                fun_id = self._dao.select_function_id(
                    file_id, fi.get('start'), fi.get('end'))

                self._dao.insert_function_at_commit(fun_id, commit_id)

    def _import_comment_info(self, commit_id, file_id, tmp_file):
        #stores only loc, comments and blanks of a file at a commit; the
        #remaining columns are passed as None
        file_info = self._code_querier.get_comment_info(tmp_file)
        self._dao.insert_code_at_commit(
            commit_id, file_id, None, file_info.get('loc'),
            file_info.get('comments'), file_info.get('blanks'), None, None,
            None, None, None)

    def _process_commit_file(self):
        #imports the code information of every commit file pair of the
        #interval; a failing pair is logged and the next one is processed
        _tmp_files = set()
        for i in self._interval:
            #reset for every pair so that the error handler below never reads
            #an unbound name or the values of the previous pair
            commit_sha = None
            file_name = None
            try:
                commit_id = i.get("commit_id")
                commit_sha = i.get("commit_sha")
                file_id = i.get("file_id")
                file_name = i.get("file_name")
                file_ext = i.get("file_ext")

                found = self._dao.select_code_at_commit(commit_id, file_id)
                #pairs with a forbidden extension or already in the DB are
                #skipped
                if file_ext in CodeQuerier.FORBIDDEN_EXTENSIONS or found:
                    continue

                #evaluated before file_ext is replaced by "unknown" below
                check_extension = file_ext in CodeQuerier.ALLOWED_EXTENSIONS

                if not file_ext:
                    file_ext = "unknown"

                _tmp_file = self._tmp_root_file + file_ext
                _tmp_files.add(_tmp_file)

                file_content_at_revision = self._git_querier.get_file_content(
                    commit_sha, file_name)
                if file_content_at_revision:
                    self._save_content(file_content_at_revision, _tmp_file)

                    if check_extension:
                        self._import_complexity_info(commit_id, file_id,
                                                     _tmp_file)
                    else:
                        self._import_comment_info(commit_id, file_id,
                                                  _tmp_file)

                if len(_tmp_files) >= Code2DbCommitFile._TMP_FILES_LIMIT:
                    self._delete_tmp_files(_tmp_files)
                    #forget the deleted files so that the deletion is not
                    #repeated on every following pair
                    _tmp_files.clear()

            except Exception:
                self._logger.error("Code2DbCommitFile failed on pair " +
                                   str(commit_sha) + ", " + str(file_name),
                                   exc_info=True)

        if _tmp_files:
            self._delete_tmp_files(_tmp_files)

    def extract(self):
        """
        extracts code function data and stores it in the DB

        Processes the interval of commit file pairs and logs the elapsed
        time. Exceptions are logged as "Code2DbCommitFile failed" instead of
        being raised.
        """
        try:
            self._logger.info("Code2DbCommitFile started")
            start_time = datetime.now()

            self._process_commit_file()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(
                end_time, start_time)
            self._logger.info("Code2DbCommitFile finished after " +
                              str(minutes_and_seconds[0]) + " minutes and " +
                              str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(
                self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbCommitFile failed", exc_info=True)