Example #1
def main():
    cli_args = Config.parse_arguments()

    # A config file must be provided, or else nothing will work.
    if not hasattr(cli_args, 'config_file') or not cli_args.config_file:
        Log.error("A config file must be specified!")
        return
    Config.parse_config(cli_args.config_file)

    Log.config()

    Log.info("Started. Creating database")
    DB.create_db()

    db_session = DB.create_session()

    miner = RepositoryMiner(
        Config.repository_path,
        db_session=db_session,
        branch=Config.repository_branch
    )
    repository = miner.repository_orm

    IssueScanner.assign_issue_tracking(
        repository,
        Config.issue_tracking_system,
        Config.issue_tracking_url,
        Config.issue_tracking_username,
        Config.issue_tracking_password, db_session=db_session)

    IssueScanner.scan_for_repository(repository)
    db_session.close()
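
Config.parse_arguments is not shown in these snippets. A minimal argparse-based sketch that would satisfy the attribute check in main() could look like the following; the flag name is an assumption, since main() only relies on the returned namespace exposing config_file:

import argparse

def parse_arguments():
    """Hypothetical stand-in for Config.parse_arguments (not in the source)."""
    parser = argparse.ArgumentParser(description="repository miner")
    # '--config-file' is an assumed flag name; main() only checks for a
    # 'config_file' attribute on the parsed namespace.
    parser.add_argument('--config-file', dest='config_file', default=None)
    return parser.parse_args()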
Example #2
    def init_db_sessions(self, db_session=None):
        """ Init DB session. When threading is activated, one DB session is created per thread.

        Args:
            db_session: Optional. If not specified, a new session will be created.
        """
        if db_session is None:
            self.db_session = DB.create_session()
        else:
            self.db_session = db_session
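
A minimal usage sketch of the per-thread pattern the docstring describes, with a stand-in DB so the snippet runs on its own (in the source, DB.create_session() returns a real database session):

import threading

class DB:  # stand-in for the project's DB module
    @staticmethod
    def create_session():
        return object()  # placeholder for a real SQLAlchemy Session

class Worker:
    def init_db_sessions(self, db_session=None):
        # same fallback logic as above: reuse a session or create a fresh one
        self.db_session = db_session if db_session is not None else DB.create_session()

def run():
    worker = Worker()        # one worker, and thus one session, per thread
    worker.init_db_sessions()
    # ... per-thread DB work via worker.db_session ...

threads = [threading.Thread(target=run) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()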
Example #3
def scan_for_repository(repository):
    """ Scans the issue tracking of a repository in the DB and assigns issues to commits.

    Iterates through all recorded commits of this repository, checks their commit message for issue references,
    trys to retrieve those issues from the associated issue tracking system and saves them in the DB.

    Args:
        repository (Repository): The the repository to scan.
    """
    assert isinstance(repository, Repository)

    reset_issue_cache()

    # get issue tracking object
    Log.info("Retrieving IssueTracking for Repository " + repository.name + " with id " + str(repository.id))
    db_session = DB.create_session()
    query = db_session.query(IssueTracking).filter(IssueTracking.repository == repository)
    try:
        issue_tracking = query.one()
    except NoResultFound:
        Log.error("No IssueTracking-Entry found for Repository " + repository.name + " with id " + str(repository.id))
        db_session.close()
        return
    Log.debug("IssueTracking found. Type: " + str(issue_tracking.type))

    if issue_tracking.type == TYPE_GITHUB:
        retrieve = GitHub.retrieve
        extract_pattern = '#[0-9]+'
        transform = lambda x: x[1:]
    elif issue_tracking.type == TYPE_JIRA:
        retrieve = Jira.retrieve
        extract_pattern = Config.issue_scanner_issue_id_regex
        if not extract_pattern:
            extract_pattern = '[A-Z][A-Z]+-[0-9]*'  # default extract pattern, not really good
        transform = None
    else:
        Log.error("No Implementation found for IssueTracking-Type '" + str(issue_tracking.type) + "'")
        db_session.close()
        return

    repository = issue_tracking.repository
    for commit in repository.commits:
        issue_ids = extract_issue_ids(commit.message, extract_pattern, transform=transform)
        for issue_id in issue_ids:
            process_issue(issue_tracking, commit, issue_id, retrieve, db_session)

    Log.info("Issue Analysis completed")
    db_session.close()
    reset_issue_cache()
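
extract_issue_ids is not shown in these snippets. Judging from the call above, a regex-based sketch could look like this (the de-duplication is an assumption):

import re

def extract_issue_ids(message, extract_pattern, transform=None):
    """Hypothetical sketch of the helper used above (not in the source)."""
    matches = re.findall(extract_pattern, message or '')
    if transform:
        # e.g. the GitHub branch above strips the leading '#'
        matches = [transform(match) for match in matches]
    return set(matches)  # assumed: duplicate references in one message collapse

# extract_issue_ids("Fix crash, closes #42 and #42", '#[0-9]+', lambda x: x[1:])
# would then return {'42'}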
Example #4
def assign_issue_tracking(repository, issue_tracking_type, url, username=None, password=None, db_session=None):
    """ Assigns

    Args:
        repository (Repository): The repository (ORM-Object) to assign the issue tracking to.
        issue_tracking_type (str): The issue tracking system type. Use one of the TYPE_X constants from IssueTracking.
        url (str): The url for the issue tracking API.
        username (str): Optional. The username for authentication.
        password (str): Optional. The password for authentication.
        db_session (Session): Optional. The db session to use. If not provided, a new one will be created.
    """
    assert isinstance(repository, Repository)

    close_db_session = False
    if not db_session:
        db_session = DB.create_session()
        close_db_session = True

    if repository.issueTracking is not None:
        Log.info("Repository " + repository.name + " with id " + str(
            repository.id) + " already has an issue tracker assigned")

        repository.issueTracking.type = issue_tracking_type
        repository.issueTracking.url = url
        repository.issueTracking.username = username
        repository.issueTracking.password = password
        db_session.commit()
    else:
        Log.info(
            "Creating new " + issue_tracking_type + " IssueTracking for Repository " + repository.name +
            " with id " + str(repository.id))
        issue_tracking = IssueTracking(
            repository=repository,
            type=issue_tracking_type,
            url=url,
            username=username,
            password=password
        )
        db_session.add(issue_tracking)

        repository.issueTracking = issue_tracking
        db_session.commit()

    if close_db_session:
        db_session.close()
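
A usage sketch with illustrative values; TYPE_GITHUB is one of the TYPE_X constants from IssueTracking mentioned in the docstring, and the URL and credentials are placeholders:

session = DB.create_session()
assign_issue_tracking(
    repository,                                    # a Repository ORM object
    TYPE_GITHUB,
    'https://api.github.com/repos/owner/project',  # placeholder API URL
    username='bot-user',                           # optional
    password='secret',                             # optional
    db_session=session,
)
session.close()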
Example #5
    def __process_commit(self, commit, previous_commit, project_size, project_file_count, db_session=None):
        """Process a single commit.

        Args:
            commit: Actual commit
            previous_commit: Previous commit for creating differences
            project_size: Actual size of the project
            project_file_count: Acutal filecount of the project
            db_session: db session...
        Returns: commit_orm object

        """
        db_session_local = False
        if not db_session:
            db_session_local = True
            db_session = DB.create_session()

        # Placeholders for optional worker threads; they stay None on this
        # synchronous path, so the join() calls near the end are skipped.
        added_files_thread = None
        changed_files_thread = None
        deleted_files_thread = None

        manipulated_files = self.__get_changed_files(commit, previous_commit)

        added_files = manipulated_files['added_files']
        added_files_count = len(added_files)
        deleted_files = manipulated_files['deleted_files']
        deleted_files_count = len(deleted_files)
        changed_files = manipulated_files['changed_files']
        changed_files_count = len(changed_files)
        renamed_files = manipulated_files['renamed_files']
        renamed_files_count = len(renamed_files)
        files_diff = manipulated_files['files_diff']

        new_project_file_count = project_file_count + added_files_count - deleted_files_count

        commit_time = datetime.datetime.utcfromtimestamp(commit.committed_date)
        commit_id = str(commit)

        commit_orm = self.__create_new_commit(db_session, commit_id, self.repository_id, commit.message,
                                              commit.author.email,
                                              commit_time, 0, 0, 0, 0, project_size, new_project_file_count)

        # no files were changed at all / very unlikely
        if (not added_files) and (not deleted_files) and (not changed_files) and (not renamed_files):
            if db_session_local:
                db_session.close()
            return commit_orm

        if added_files:
            for file in added_files:
                programming_language = self.__get_programming_langunage(file.path)

                file_orm = self.__create_new_file(db_session, self.repository_id,
                                                  programming_language)

                created_version = self.__create_new_version(db_session, file_orm.id, commit_id, 0, 0, 0, file.path)

                # skip this file because its language is not interesting to us
                if not programming_language:
                    added_files_count -= 1
                    continue

                self.__process_file_diff(db_session, commit_id, file, files_diff, created_version)

        if deleted_files:
            for file in deleted_files:
                programming_language = self.__get_programming_langunage(file.path)
                if not programming_language:
                    deleted_files_count -= 1

                try:
                    version_orm = self.__process_deleted_or_changed_file(db_session, commit_id, file,
                                                                         programming_language,
                                                                         files_diff)
                    version_orm.deleted = True
                    version_orm.file_size = 0
                except ValueError as e:
                    Log.warning("Warning processing commit: " + str(commit_id) + ". File affected: " + str(
                        file.path) + " Reason: " + str(e))

        if changed_files:
            for file in changed_files:
                programming_language = self.__get_programming_langunage(file.path)
                if not programming_language:
                    changed_files_count -= 1

                try:
                    self.__process_deleted_or_changed_file(db_session, commit_id, file, programming_language,
                                                           files_diff)
                except ValueError as e:
                    Log.warning("Warning processing commit: " + str(commit_id) + ". File affected: " + str(
                        file.path) + " Reason: " + str(e))

        # for renamed files just create a new one and link to the old one
        if renamed_files:
            for file in renamed_files:
                old_file = file['old_file']
                new_file = file['new_file']

                old_version_orm = (db_session.query(Commit, Version)
                                   .filter(Commit.id == Version.commit_id,
                                           Version.path == str(old_file.path),
                                           Commit.repository_id == str(self.repository_id))
                                   .order_by(desc(Commit.timestamp))
                                   .first())

                programming_language = self.__get_programming_langunage(new_file.path)

                if not old_version_orm:
                    Log.warning("Could not process commit " + str(
                        commit_id) + ". Could not process rename because old file was not found. Old file: " + str(
                        old_file.path) + " new file: " + str(new_file.path))
                    file_orm = self.__create_new_file(db_session, self.repository_id,
                                                      programming_language)
                    old_version_orm = self.__create_new_version(db_session, file_orm.id, commit_id, 0, 0, 0,
                                                                new_file.path)
                    version_orm = old_version_orm
                else:
                    old_version_orm = old_version_orm.Version
                    version_orm = self.__create_new_version(db_session, old_version_orm.file_id, commit_id, 0, 0, 0,
                                                            new_file.path)

                # skip this file because its language is not interesting to us
                if not programming_language:
                    renamed_files_count -= 1
                    continue

                version_orm.file_size = old_version_orm.file_size
                self.__process_file_diff(db_session, commit_id, new_file, files_diff, version_orm)

        commit_orm.added_files_count = added_files_count
        commit_orm.deleted_files_count = deleted_files_count
        commit_orm.changed_files_count = changed_files_count
        commit_orm.renamed_files_count = renamed_files_count

        if added_files_thread:
            added_files_thread.join()
        if changed_files_thread:
            changed_files_thread.join()
        if deleted_files_thread:
            deleted_files_thread.join()

        if db_session_local:
            db_session.close()

        return commit_orm
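
The shape of __get_changed_files()'s return value, inferred from the accesses above (the real implementation is not shown, so treat this as an assumption):

manipulated_files = {
    'added_files':   [],    # diff entries for newly added files
    'deleted_files': [],    # diff entries for removed files
    'changed_files': [],    # diff entries for modified files
    'renamed_files': [],    # dicts with 'old_file' and 'new_file' entries
    'files_diff':    None,  # raw diff data consumed by __process_file_diff()
}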
Example #6
def get_dataset_from_db(repository,
                        start,
                        end,
                        feature_list,
                        target_id,
                        ngram_sizes=None,
                        ngram_levels=None,
                        label="",
                        eager_load=False,
                        sparse=False):
    """ Reads a dataset from a repository in a specific time range

    Args:
        repository (Repository): The repository to query. Can also be its name as a string
        start (datetime): The start range
        end (datetime): The end range
        feature_list (list[str]): A list of the feature-IDs to be read into the dataset.
        target_id (str): The ID of the target. Use a TARGET_X constant from UpcomingBugsForVersion
        ngram_sizes (list[int]): Optional. The ngram-sizes to be loaded in the set (e.g. [1, 2] for 1-grams and 2-grams)
        ngram_levels (list[int]): Optional. The ngram-levels to be loaded in the dataset.
        label (str): The label to be assigned to the dataset.
        eager_load (bool): If true, all data will be loaded eagerly. This reduces database calls, but uses a lot of RAM.
        sparse (bool): If the data and target matrices should be sparse. Recommended in combination with ngrams.

    Returns:
        Dataset: The populated dataset.
    """
    if ngram_sizes and not isinstance(ngram_sizes, list):
        ngram_sizes = [ngram_sizes]
    if ngram_levels and not isinstance(ngram_levels, list):
        ngram_levels = [ngram_levels]
    use_ngrams = bool(ngram_sizes and ngram_levels)

    session = DB.create_session()

    if isinstance(repository, str):
        repository_name = repository
        repository = get_repository_by_name(session, repository_name)
        if repository is None:
            logging.error(
                "Repository with name %s not found! Returning no Dataset" %
                repository_name)
            session.close()
            return None

    commits = get_commits_in_range(session,
                                   repository,
                                   start,
                                   end,
                                   eager_load_ngrams=use_ngrams and eager_load,
                                   eager_load_features=eager_load)
    if commits is None:
        logging.error("Could not retrieve commits! Returning no Dataset")
        session.close()
        return None
    logging.debug("Commits received.")

    if len(commits) == 0:
        logging.error("No Commits found!")
        session.close()
        return None

    versions = []
    for commit in commits:
        versions += commit.versions
    logging.debug("%i commits with %i versions found." %
                  (len(commits), len(versions)))

    feature_count = len(feature_list)
    logging.debug("%i features found." % feature_count)

    ngram_count = 0
    if use_ngrams:
        ngrams = get_ngram_vector_list(versions[0], ngram_sizes, ngram_levels)
        ngram_count = sum([ngram.vector_size for ngram in ngrams])
        logging.debug(
            "Ngram sizes %s and levels %s amount to %i total ngrams." %
            (str(ngram_sizes), str(ngram_levels), ngram_count))

    dataset = Dataset(feature_count + ngram_count,
                      len(versions),
                      feature_list,
                      target_id,
                      start,
                      end,
                      ngram_sizes,
                      ngram_levels,
                      label,
                      sparse=sparse,
                      dok=True)
    i = 0
    for version in versions:
        if len(version.upcoming_bugs) == 0:
            raise Exception(
                "Version %s has no upcoming_bugs entry. Can't retrieve target!"
                % version.id)
        target = version.upcoming_bugs[0].get_target(target_id)
        if target is None:
            raise Exception(
                "Upcoming_bugs entry of Version %s has no target %s!" %
                (version.id, target_id))
        dataset.target[i] = target

        j = 0
        for feature_value in version.feature_values:
            if feature_value.feature_id in feature_list:
                if not sparse or feature_value.value != 0:
                    dataset.data[i, j] = feature_value.value
                j += 1
        if use_ngrams:
            for ngram_vector in get_ngram_vector_list(version, ngram_sizes,
                                                      ngram_levels):
                for ngram_value in ngram_vector.ngram_values.split(','):
                    ngram_value = int(ngram_value)
                    if not sparse or ngram_value != 0:
                        dataset.data[i, j] = ngram_value
                    j += 1

        if i % 100 == 0:
            logging.info("{0:.2f}% of versions processed.".format(
                i / len(versions) * 100))

        i += 1
    logging.info("All versions processed.")

    if sparse:
        dataset.to_csr()

    session.close()
    return dataset
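
A usage sketch for get_dataset_from_db; the feature IDs and the TARGET_X constant name are illustrative, not taken from the source:

import datetime

dataset = get_dataset_from_db(
    'my-project',                                    # looked up by name
    datetime.datetime(2020, 1, 1),
    datetime.datetime(2020, 6, 30),
    feature_list=['LinesAdded', 'LinesDeleted'],     # hypothetical feature IDs
    target_id=UpcomingBugsForVersion.TARGET_BUGS,    # assumed constant name
    ngram_sizes=[1, 2],
    ngram_levels=[1],
    label='train',
    sparse=True,                                     # recommended with ngrams
)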