Example no. 1
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues-github.list" in the results folder.

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-github.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues-github.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            lines.append((
                issue["number"],
                issue["title"],
                json.dumps(issue["type"]),
                issue["state_new"],
                json.dumps(issue["resolution"]),
                issue["created_at"],
                issue["closed_at"],
                json.dumps([]),  # components
                event["event"],
                event["user"]["name"],
                event["user"]["email"],
                event["created_at"],
                event["event_info_1"],
                json.dumps(event["event_info_2"])))

    # write to output file
    csv_writer.write_to_csv(
        output_file, sorted(set(lines), key=lambda line: lines.index(line)))
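
Note: the expression sorted(set(lines), key=lambda line: lines.index(line)) deduplicates the rows while preserving their first-occurrence order, but the repeated lines.index() calls make it quadratic. A minimal sketch of an equivalent linear-time helper (not part of the original module, just an illustration):

def unique_in_order(lines):
    # keep only the first occurrence of each row, preserving order;
    # equivalent to sorted(set(lines), key=lambda line: lines.index(line))
    seen = set()
    result = []
    for line in lines:
        if line not in seen:
            seen.add(line)
            result.append(line)
    return result
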
Example no. 2
def merge_user_with_user_from_csv(user, persons):
    """
    Merge the given user with the list of already known users.

    :param user: the user to be merged
    :param persons: contains maps of names/usernames to persons from JIRA (incl. e-mail addresses),
                    see function "load_csv"
    :return: the merged user
    """

    new_user = dict()
    name_utf8 = unicode(user["name"]).encode("utf-8")
    username_utf8 = unicode(user["username"].lower()).encode("utf-8")

    if username_utf8 in persons["by_username"].keys():
        new_user["username"] = username_utf8
        new_user["name"] = unicode(
            persons["by_username"].get(username_utf8)[0]).encode("utf-8")
        new_user["email"] = unicode(
            persons["by_username"].get(username_utf8)[1]).encode("utf-8")
    elif name_utf8 in persons["by_name"].keys():
        new_user["username"] = username_utf8
        new_user["name"] = unicode(
            persons["by_name"].get(name_utf8)[0]).encode("utf-8")
        new_user["email"] = unicode(
            persons["by_name"].get(name_utf8)[1]).encode("utf-8")
    else:
        new_user["username"] = username_utf8
        new_user["name"] = name_utf8
        new_user["email"] = unicode(user["email"]).encode("utf-8")
        log.warning("User not in csv-file: " + str(user))

    log.info("current User: "******",    new user: " + str(new_user))
    return new_user
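
A minimal usage sketch for the function above, assuming the "by_username"/"by_name" layout that the docstring attributes to load_csv; all names, usernames, and e-mail addresses here are hypothetical placeholders:

# hypothetical lookup data; real data comes from load_csv
persons = {
    "by_username": {"jdoe": ("John Doe", "jdoe@example.org")},
    "by_name": {"John Doe": ("John Doe", "jdoe@example.org")},
}
user = {"username": "JDoe", "name": "John Doe", "email": ""}
merged = merge_user_with_user_from_csv(user, persons)
# merged["email"] == "jdoe@example.org", resolved via the lower-cased username
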
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues.list" in result folder.
    This format is outdated but still used by the network library.
    TODO When the network library is updated, this method can be overwritten by "print_to_disk_new".

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            lines.append((issue["number"], issue["state"], issue["created_at"],
                          issue["closed_at"], issue["isPullRequest"],
                          event["user"]["name"], event["user"]["email"],
                          event["created_at"], "" if event["ref_target"] == ""
                          else event["ref_target"]["name"], event["event"]))

    # write to output file
    csv_writer.write_to_csv(output_file, lines)
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues-jira.list" in result folder

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-jira.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))
        lines.append((issue["author"]["name"],
                      issue["author"]["email"],
                      issue["externalId"],
                      issue["creationDate"],
                      issue["externalId"],
                      issue["type"]))
        for comment in issue["comments"]:
            lines.append((
                comment["author"]["name"],
                comment["author"]["email"],
                comment["id"],
                comment["changeDate"],
                issue["externalId"],
                "comment"
            ))

    # write to output file
    csv_writer.write_to_csv(output_file, lines, append=True)
def load_xml(source_folder):
    """Load issues from disk.

    :param source_folder: the folder where to find .xml-files
    :return: the loaded issue data
    """

    filelist = [
        f for f in os.listdir(source_folder)
        if os.path.isfile(os.path.join(source_folder, f))
    ]
    issue_data = list()
    for file in filelist:
        srcfile = os.path.join(source_folder, file)
        log.devinfo("Loading issues from file '{}'...".format(srcfile))

        # check if file exists and exit early if not
        if not os.path.exists(srcfile):
            log.info("Issue file '{}' does not exist! Exiting early...".format(
                srcfile))
            sys.exit(-1)

        # with open(srcfile, 'r') as issues_file:
        xmldoc = parse(srcfile)
        issue_data.append(xmldoc)

    return issue_data
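
A short usage sketch, assuming parse comes from xml.dom.minidom (consistent with the getElementsByTagName calls applied to the returned documents elsewhere); the folder path is a placeholder:

# hypothetical invocation; the path is a placeholder
docs = load_xml("/path/to/issues_xml")
for doc in docs:
    items = doc.getElementsByTagName("item")
    log.devinfo("Found {} issue items".format(len(items)))
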
def run():
    # get all needed paths and arguments for the method call.
    parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction')
    parser.add_argument('-c', '--config', help="Codeface configuration file", default='codeface.conf')
    parser.add_argument('-p', '--project', help="Project configuration file", required=True)
    parser.add_argument('resdir', help="Directory to store analysis results in")

    # parse arguments
    args = parser.parse_args(sys.argv[1:])
    __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project))

    # create configuration
    __conf = Configuration.load(__codeface_conf, __project_conf)

    # get source and results folders
    __srcdir = os.path.abspath(os.path.join(args.resdir, __conf['repo'] + "_issues"))
    __resdir = os.path.abspath(os.path.join(args.resdir, __conf['project'], __conf["tagging"]))

    # run processing of issue data:
    # 1) load the list of issues
    issues = load(__srcdir)
    # 2) re-format the issues
    issues = reformat_issues(issues)
    # 3) merge all issue events into one list
    issues = merge_issue_events(issues)
    # 4) re-format the eventsList of the issues
    issues = reformat_events(issues)
    # 5) update user data with Codeface database
    issues = insert_user_data(issues, __conf)
    # 6) dump result to disk
    print_to_disk(issues, __resdir)

    log.info("Github issue processing complete!")
def print_to_disk_extr(issues, results_folder):
    """
    Print issues to file "issues.list" in result folder

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        lines.append((
            issue["externalId"],
            issue["state"],
            issue["creationDate"],
            issue["resolveDate"],
            False,  ## Value of is.pull.request
            issue["author"]["name"],
            issue["author"]["email"],
            issue["creationDate"],
            "",  ## ref.name
            "open"  ## event.name
        ))

        lines.append((
            issue["externalId"],
            issue["state"],
            issue["creationDate"],
            issue["resolveDate"],
            False,  ## Value of is.pull.request
            issue["author"]["name"],
            issue["author"]["email"],
            issue["creationDate"],
            "",  ## ref.name
            "commented"  ## event.name
        ))

        for comment in issue["comments"]:
            lines.append((
                issue["externalId"],
                issue["state"],
                issue["creationDate"],
                issue["resolveDate"],
                False,  ## Value of is.pull.request
                comment["author"]["name"],
                comment["author"]["email"],
                comment["changeDate"],
                "",  ## ref.name
                "commented"  ## event.name
            ))
    # write to output file
    csv_writer.write_to_csv(output_file, lines, append=True)
def run_extraction(conf, resdir, extract_commit_messages, extract_impl,
                   extract_on_range_level):
    """
    Runs the extraction process for the list of given parameters.

    :param conf: the Codeface configuration object
    :param resdir: the Codeface results dir, where output files are written
    :param extract_commit_messages: flag whether to extract commit messages
    :param extract_impl: flag whether to extract implementation data
    :param extract_on_range_level: flag whether to run the range-level extractions
    """

    log.info("%s: Extracting data" % conf["project"])

    # initialize database manager with given configuration
    dbm = DBManager(conf)

    # get all types of extractions, both project-level and range-level
    __extractions_project, __extractions_range = extractions.get_extractions(
        dbm, conf, resdir, csv_writer, extract_commit_messages, extract_impl,
        extract_on_range_level)

    # run project-level extractions
    for extraction in __extractions_project:
        extraction.run()

    # run range-level extractions (only if explicitly enabled)
    if extract_on_range_level:

        # check if list of revisions in database is the same as in the config file
        revs = conf["revisions"]
        list_of_revisions = extractions.RevisionExtraction(
            dbm, conf, resdir, csv_writer).get_list()
        if revs:
            if set(revs) != set(list_of_revisions):
                log.error(
                    "List of revisions in configuration file do not match the list stored in the DB! Stopping now."
                )
                sys.exit(1)
            else:
                log.info(
                    "List of revisions in configuration file and DB match.")
        else:
            log.info(
                "No list of revisions found in configuration file, using the list from the DB instead!"
            )
            revs = list_of_revisions  # set list of revisions as stored in the database

        # for all revisions of this project
        for i in range(len(revs) - 1):
            start_rev = revs[i]
            end_rev = revs[i + 1]
            range_number = i + 1

            log.info("%s: Extracting data for range %s [version '%s']" %
                     (conf["project"], range_number, end_rev))

            for extraction in __extractions_range:
                extraction.run(range_number, start_rev, end_rev)

    log.info("Extraction complete!")
Example no. 9
def parse(mbox_name, results_folder, include_filepath, files_as_artifacts,
          reindex, append_result):
    """Parse the given mbox file with the commit information from the results folder.

    :param mbox_name: the mbox file to search in
    :param results_folder: the results folder for index and commit information
    :param include_filepath: indicator whether to take the 'file name' part of the artifact into account
    :param files_as_artifacts: indicator whether to search for files (base names) as artifacts
    :param reindex: force reindexing if True
    :param append_result: flag whether to append the results for the current mbox file to the output file
    """

    # load mbox file
    mbox = mailbox.mbox(mbox_name)

    # create schema for text search
    analyzer = StandardAnalyzer(
        expression=r"[^\s,:\"']+"
    )  # split by whitespace, commas, colons, and quotation marks.
    schema = Schema(messageID=ID(stored=True), content=TEXT(analyzer=analyzer))

    # create/load index (initialize if necessary)
    ix = __get_index(mbox, mbox_name, results_folder, schema, reindex)

    # extract artifacts from results folder
    artifacts = __get_artifacts(results_folder, files_as_artifacts)

    # parallelize execution call for the text search
    log.info("Start parsing...")
    num_cores = multiprocessing.cpu_count()
    csv_data = Parallel(n_jobs=num_cores - 1)(
        delayed(__parse_execute)(commit, schema, ix, include_filepath)
        for commit in artifacts)
    log.info("Parsing finished.")

    # re-arrange results
    result = []
    if not append_result:
        result.append(('file', 'artifact', 'messageID'))
    for entry in csv_data:
        for row in entry:
            result.append(row)

    # determine output file
    filename = "mboxparsing"
    if files_as_artifacts:
        filename += "_file"
    if include_filepath:
        filename += "_filepath.list"
    else:
        filename += ".list"
    output_file = os.path.join(results_folder, filename)

    # write found hits to the output file
    log.info("Writing results to file {}.".format(output_file))
    csv_writer.write_to_csv(output_file, result, append=append_result)

    log.info("Parsing mbox file complete!")
def run():
    # get all needed paths and arguments for the method call.
    parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github',
                                     description='Codeface extraction')
    parser.add_argument('-c',
                        '--config',
                        help="Codeface configuration file",
                        default='codeface.conf')
    parser.add_argument('-p',
                        '--project',
                        help="Project configuration file",
                        required=True)
    parser.add_argument('resdir',
                        help="Directory to store analysis results in")

    # parse arguments
    args = parser.parse_args(sys.argv[1:])
    __codeface_conf, __project_conf = map(os.path.abspath,
                                          (args.config, args.project))

    # create configuration
    __conf = Configuration.load(__codeface_conf, __project_conf)

    # get source and results folders
    __srcdir = os.path.abspath(
        os.path.join(args.resdir, __conf['repo'] + "_issues"))
    __resdir = os.path.abspath(
        os.path.join(args.resdir, __conf['project'], __conf["tagging"]))

    # run processing of issue data:
    # 1) load the list of issues
    issues = load(__srcdir)
    # 2) re-format the issues
    issues = reformat_issues(issues)
    # 3) merge all issue events into one list
    issues = merge_issue_events(issues)
    # 4) re-format the eventsList of the issues
    issues = reformat_events(issues)
    # 5) update user data with Codeface database
    issues = insert_user_data(issues, __conf)
    # 6) dump result to disk
    print_to_disk(issues, __resdir)
    print_to_disk_new(issues, __resdir)

    log.info("Github issue processing complete!")
def clear_result_files(results_folder):
    """
    Creates an empty csv file for every result file.

    :param results_folder: the folder where to save the result files
    """

    log.info("Clear result files ...")

    # construct list of path to output files
    output_files = [os.path.join(results_folder, "issues-jira.list"), os.path.join(results_folder, "bugs-jira.list"),
                    os.path.join(results_folder, "issue-jira.list"),
                    os.path.join(results_folder, "issues-jira-gephi-edges.csv"),
                    os.path.join(results_folder, "issues-jira-gephi-nodes.csv")]

    # creates empty csv files
    for output_file in output_files:
        open(output_file, "w+").close()
def load_xml(source_folder, xml_file):
    """
    Load issues from disk.

    :param source_folder: the folder where the .xml-file is located
    :param xml_file: the given xml-file
    :return: the loaded issue data
    """

    srcfile = os.path.join(source_folder, xml_file)
    log.devinfo("Loading issues from file '{}'...".format(srcfile))

    try:
        # parse the xml-file
        issue_data = parse(srcfile)
        return issue_data
    except Exception as e:
        log.info("Issue file " + format(srcfile) + " couldn't be opened because of a " + e.__class__.__name__)
        return None
def merge_user_with_user_from_csv(user, persons):
    """
    Merge the given user with the list of already known users.

    :param user: the user to be merged
    :param persons: list of persons from JIRA (incl. e-mail addresses)
    :return: the merged user
    """

    new_user = dict()
    if user["username"].lower() in persons.keys():
        new_user["username"] = unicode(user["username"].lower()).encode("utf-8")
        new_user["name"] = unicode(persons.get(user["username"].lower())[0]).encode("utf-8")
        new_user["email"] = unicode(persons.get(user["username"].lower())[1]).encode("utf-8")
    else:
        new_user = user
        log.warning("User not in csv-file: " + str(user))
    log.info("current User: "******",    new user: " + str(new_user))
    return new_user
Example no. 14
def print_to_disk_gephi(issues, results_folder):
    """
    Print issues to file "issues-jira-gephi-nodes.csv" and
    "issues-jira-gephi-edges.csv" in result folder. The files can be
    used to build dynamic networks in Gephi.

    :param issues: the issues to dump
    :param results_folder: the folder where to place the two output files
    """

    # construct path to output file
    output_file_nodes = os.path.join(results_folder,
                                     "issues-jira-gephi-nodes.csv")
    output_file_edges = os.path.join(results_folder,
                                     "issues-jira-gephi-edges.csv")
    log.info("Dumping output in file '{}'...".format(output_file_nodes))
    log.info("Dumping output in file '{}'...".format(output_file_edges))

    # construct lines of output
    node_lines = []
    edge_lines = []
    node_lines.append(("Id", "Type"))
    edge_lines.append(("Source", "Target", "Timestamp", "Edgetype"))
    for issue in issues:
        node_lines.append((issue["externalId"], "Issue"))
        node_lines.append((issue["author"]["name"], "Person"))

        edge_lines.append((issue["author"]["name"], issue["externalId"],
                           issue["creationDate"], "Person-Issue"))
        for comment in issue["comments"]:
            node_lines.append((comment["id"], "Comment"))
            node_lines.append((comment["author"]["name"], "Person"))

            edge_lines.append((issue["externalId"], comment["id"],
                               comment["changeDate"], "Issue-Comment"))
            edge_lines.append((comment["author"]["name"], comment["id"],
                               ["changeDate"], "Person-Comment"))
    # write to output file
    csv_writer.write_to_csv(output_file_edges, edge_lines, append=True)
    csv_writer.write_to_csv(output_file_nodes, node_lines, append=True)
def print_to_disk(issues, results_folder):
    """Print issues to file 'issues.list' in result folder

    :param issues: the issues to dump
    :param results_folder: the folder where to place 'issues.list' output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            lines.append((issue["number"], issue["state"], issue["created_at"],
                          issue["closed_at"], issue["isPullRequest"],
                          event["user"]["name"], event["user"]["email"],
                          event["created_at"], "" if event["ref_target"] == ""
                          else event["ref_target"]["name"], event["event"]))

    # write to output file
    csv_writer.write_to_csv(output_file, lines)
Exemplo n.º 16
0
    def run(self, start_revision=None, end_revision=None):
        """
        Runs the extraction.

        :param start_revision: start of a release range (for range-level extractions)
        :param end_revision: end of a release range (for range-level extractions)
        """

        artifacts = self._tagging2artifacts[self.tagging]
        if not self.is_generic_extraction():
            artifacts = [artifacts[0]]

        for entity_type in artifacts:
            log.info("%s: %s to %s" %
                     (self.project,
                      self.__class__.__name__,
                      self._get_out_file(start_revision, end_revision, entity_type)
                      ))

            result = self._run_sql(end_revision, entity_type)
            lines = self._reduce_result(result)
            outfile = self._get_out_file(start_revision, end_revision, entity_type)
            self._write_export_file(lines, outfile)
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues.list" in result folder.
    This format is outdated but still used by the network library.
    TODO When the network library is updated, this method can be overwritten by "print_to_disk_new".

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues-github.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            lines.append((
                issue["number"],
                issue["title"],
                json.dumps(issue["type"]),
                issue["state_new"],
                json.dumps(issue["resolution"]),
                issue["created_at"],
                issue["closed_at"],
                json.dumps([]),  # components
                event["event"],
                event["user"]["name"],
                event["user"]["email"],
                event["created_at"],
                event["event_info_1"],
                json.dumps(event["event_info_2"])
            ))

    # write to output file
    csv_writer.write_to_csv(output_file, lines)
def perform_data_backup(results_path, results_path_backup):
    """
    Copy the existing .list files of a certain directory (also recursively) to a separate backup folder.
    If the backup folder already exists, no files are copied, i.e., no backup is performed.

    :param results_path: the results dir, from which the data should be backed up
    :param results_path_backup: the results dir where the backup should be written to
    """

    if path.exists(results_path_backup):
        log.info("Backup folder already exists. No backup is to be performed.")
        return

    for filepath, dirnames, filenames in walk(results_path):
        for filename in filenames:
            if filename.endswith(".list"):
                current_file = path.join(filepath, filename)
                backup_file = path.join(results_path_backup,
                                        filepath[len(results_path) + 1:],
                                        filename)
                if not path.exists(path.dirname(backup_file)):
                    makedirs(path.dirname(backup_file))
                log.info("Backup %s to %s" % (current_file, backup_file))
                copy(current_file, backup_file)
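
The slice filepath[len(results_path) + 1:] computes the directory of the current file relative to results_path. A sketch of an equivalent formulation built on os.path.relpath, assuming results_path carries no trailing separator (as the slice-based version does):

from os import path

def backup_destination(results_path, results_path_backup, filepath, filename):
    # mirror the directory layout of results_path under results_path_backup;
    # relpath yields "." for the top level, which normpath collapses again
    rel_dir = path.relpath(filepath, results_path)
    return path.normpath(path.join(results_path_backup, rel_dir, filename))
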
def print_to_disk_new(issues, results_folder):
    """
    Print issues to file "issues_new.list" in result folder.
    This file has a consistent format to the "bugs-jira.list" file.
    TODO When the network library is updated, this is the format which shall be used.

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "new_format.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        for event in issue["eventsList"]:
            lines.append((
                issue["number"],
                issue["title"],
                issue["type"],
                issue["state_new"],
                issue["resolution"],
                issue["created_at"],
                issue["closed_at"],
                [],  # components
                event["event"],
                event["user"]["name"],
                event["user"]["email"],
                event["created_at"],
                event["event_info_1"],
                event["event_info_2"]))

    # write to output file
    csv_writer.write_to_csv(output_file, lines)
def run_extraction(conf, resdir):
    """
    Runs the extraction process for the list of given parameters.

    :param conf: the Codeface configuration object
    :param resdir: the Codeface results dir, where output files are written
    """

    log.info("%s: Extracting data" % conf["project"])

    # initialize database manager with given configuration
    dbm = DBManager(conf)

    # get all types of extractions, both project-level and range-level
    __extractions_project, __extractions_range = extractions.get_extractions(dbm, conf, resdir)

    # run project-level extractions
    for extraction in __extractions_project:
        extraction.run()

    # check if list of revisions in database is the same as in the config file
    revs = conf["revisions"]
    list_of_revisions = extractions.RevisionExtraction(dbm, conf, resdir).get_list()
    if revs:
        if set(revs) != set(list_of_revisions):
            log.error("List of revisions in configuration file do not match the list stored in the DB! Stopping now.")
            sys.exit(1)
        else:
            log.info("List of revisions in configuration file and DB match.")
    else:
        log.info("No list of revisions found in configuration file, using the list from the DB instead!")
        revs = list_of_revisions  # set list of revisions as stored in the database

    # for all revisions of this project
    for i in range(len(revs) - 1):
        start_rev = revs[i]
        end_rev = revs[i + 1]

        log.info("%s: Extracting data for version '%s'" % (conf["project"], end_rev))

        for extraction in __extractions_range:
            extraction.run(start_rev, end_rev)
def parse_xml(issue_data, persons, skip_history):
    """
    Parse issues from the xml-data.

    :param issue_data: list of xml-files
    :param persons: list of persons from JIRA (incl. e-mail addresses)
    :param skip_history: flag whether the history will be loaded in a different method
    :return: list of parsed issues
    """

    log.info("Parse jira issues...")
    issues = list()
    issuelist = issue_data.getElementsByTagName("item")
    # re-process all issues
    log.debug("Number of issues:" + str(len(issuelist)))
    for issue_x in issuelist:
        # temporary container for references
        comments = list()
        issue = dict()
        components = []

        # parse values from xml
        # add issue values to the issue
        key = issue_x.getElementsByTagName("key")[0]
        issue["id"] = key.attributes["id"].value
        issue["externalId"] = key.firstChild.data

        created = issue_x.getElementsByTagName("created")[0]
        createDate = created.firstChild.data
        issue["creationDate"] = format_time(createDate)

        resolved = issue_x.getElementsByTagName("resolved")
        issue["resolveDate"] = ""
        if (len(resolved) > 0) and (resolved[0] is not None):
            resolveDate = resolved[0].firstChild.data
            issue["resolveDate"] = format_time(resolveDate)

        title = issue_x.getElementsByTagName("title")[0]
        issue["title"] = title.firstChild.data

        link = issue_x.getElementsByTagName("link")[0]
        issue["url"] = link.firstChild.data

        type = issue_x.getElementsByTagName("type")[0]
        issue["type"] = type.firstChild.data
        # TODO new consistent format with GitHub issues. Not supported by the network library yet
        issue["type_new"] = ["issue", str(type.firstChild.data.lower())]

        status = issue_x.getElementsByTagName("status")[0]
        issue["state"] = status.firstChild.data
        # TODO new consistent format with GitHub issues. Not supported by the network library yet
        issue["state_new"] = status.firstChild.data.lower()

        project = issue_x.getElementsByTagName("project")[0]
        issue["projectId"] = project.attributes["id"].value

        resolution = issue_x.getElementsByTagName("resolution")[0]
        issue["resolution"] = resolution.firstChild.data
        # new consistent format with GitHub issues. Not supported by the network library yet
        issue["resolution_new"] = [str(resolution.firstChild.data.lower())]

        # consistency to default GitHub labels
        if issue["resolution"] == "Won't Fix":
            issue["resolution_new"] = ["wontfix"]

        # consistency to default GitHub labels
        if issue["resolution"] == "Won't Do":
            issue["resolution_new"] = ["wontdo"]

        for component in issue_x.getElementsByTagName("component"):
            components.append(str(component.firstChild.data))
        issue["components"] = components

        # if links are not loaded via api, they are added as a history event with less information
        if skip_history:
            issue["history"] = []
            for ref in issue_x.getElementsByTagName("issuelinktype"):
                history = dict()
                history["event"] = "add_link"
                history["author"] = create_user("", "", "")
                history["date"] = ""
                history["event_info_1"] = ref.getElementsByTagName("issuekey")[0].firstChild.data
                history["event_info_2"] = "issue"

                issue["history"].append(history)

        reporter = issue_x.getElementsByTagName("reporter")[0]
        user = create_user(reporter.firstChild.data, reporter.attributes["username"].value, "")
        issue["author"] = merge_user_with_user_from_csv(user, persons)

        issue["title"] = issue_x.getElementsByTagName("title")[0].firstChild.data

        # add comments / issue_changes to the issue
        for comment_x in issue_x.getElementsByTagName("comment"):
            comment = dict()
            comment["id"] = comment_x.attributes["id"].value
            user = create_user("", comment_x.attributes["author"].value, "")
            comment["author"] = merge_user_with_user_from_csv(user, persons)
            comment["state_on_creation"] = issue["state"]  # can get updated if history is retrieved
            comment["resolution_on_creation"] = issue["resolution"]  # can get updated if history is retrieved

            created = comment_x.attributes["created"].value
            comment["changeDate"] = format_time(created)

            comment["text"] = comment_x.firstChild.data
            comment["issueId"] = issue["id"]
            comments.append(comment)

        issue["comments"] = comments

        # add relations to the issue
        relations = list()
        for rel in issue_x.getElementsByTagName("issuelinktype"):
            rel_name = rel.getElementsByTagName("name")[0].firstChild.data

            # note: a fresh dict is created per issue key; appending one shared,
            # mutated dict would leave all entries pointing to the same object
            inward = rel.getElementsByTagName("inwardlinks")
            if inward:
                for key in inward[0].getElementsByTagName("issuekey"):
                    relations.append({"relation": rel_name,
                                      "type": "inward",
                                      "id": key.firstChild.data})

            outward = rel.getElementsByTagName("outwardlinks")
            if outward:
                for key in outward[0].getElementsByTagName("issuekey"):
                    relations.append({"relation": rel_name,
                                      "type": "outward",
                                      "id": key.firstChild.data})

        issue["relations"] = relations
        issues.append(issue)
    log.debug("number of issues after parse_xml: '{}'".format(len(issues)))
    return issues
def merge_issue_events(issue_data):
    """
    All issue events are merged together in the eventsList. This simplifies processing in later steps.

    :param issue_data: the issue data from which the events shall be merged
    :return: the issue data with merged eventsList
    """

    log.info("Merge issue events ...")

    for issue in issue_data:

        # temporary container for references
        comments = dict()

        # adds creation event to eventsList
        created_event = dict()
        created_event["user"] = issue["user"]
        created_event["created_at"] = issue["created_at"]
        created_event["event"] = "created"
        created_event["event_info_1"] = "open"
        created_event["event_info_2"] = []
        issue["eventsList"].append(created_event)
        issue["state_new"] = "open"

        # the format of every related issue is adjusted to the event format
        for rel_issue in issue["relatedIssues"]:
            rel_issue["created_at"] = format_time(rel_issue["referenced_at"])
            rel_issue["event"] = "add_link"
            rel_issue["event_info_1"] = rel_issue["number"]
            rel_issue["event_info_2"] = "issue"
            rel_issue["ref_target"] = ""

        # the format of every related commit is adjusted to the event format
        for rel_commit in issue["relatedCommits"]:

            # if the related commit has no time, it is a commit in the pull-request
            if rel_commit["referenced_at"] is None:
                rel_commit["user"] = create_user("", "", "")
                rel_commit["created_at"] = ""
                rel_commit["event"] = "has_commit"
                rel_commit["event_info_1"] = rel_commit["commit_id"]
                rel_commit["event_info_2"] = ""
                rel_commit["ref_target"] = ""
            # else it is a commit the issue/pull-request refers to
            else:
                rel_commit["created_at"] = format_time(
                    rel_commit["referenced_at"])
                rel_commit["event"] = "add_link"
                rel_commit["event_info_1"] = rel_commit["commit_id"]
                rel_commit["event_info_2"] = "commit"
                rel_commit["ref_target"] = ""

        # the format of every comment is adjusted to the event format
        for comment in issue["commentsList"]:
            comment["event"] = "commented"
            comment["ref_target"] = ""
            comment["created_at"] = format_time(comment["referenced_at"])
            if "event_info_1" not in comment:
                comment["event_info_1"] = ""
            if "event_info_2" not in comment:
                comment["event_info_2"] = ""

            # cache comment by date to resolve/re-arrange references later
            comments[comment["created_at"]] = comment

        # the format of every event is adjusted
        for event in issue["eventsList"]:
            event["ref_target"] = ""
            event["created_at"] = format_time(event["created_at"])
            if "event_info_1" not in event:
                event["event_info_1"] = ""
            if "event_info_2" not in event:
                event["event_info_2"] = ""

            # if event collides with a comment
            if event["created_at"] in comments:
                comment = comments[event["created_at"]]
                # if someone gets mentioned or subscribed by someone else in a comment,
                # re-write the reference
                if (event["event"] == "mentioned" or event["event"] == "subscribed") and \
                                comment["event"] == "commented":
                    event["ref_target"] = event["user"]
                    event["user"] = comment["user"]

        # merge events, relatedCommits, relatedIssues and comment lists
        issue["eventsList"] = issue["commentsList"] + issue[
            "eventsList"] + issue["relatedIssues"] + issue["relatedCommits"]

        # remove events without user
        issue["eventsList"] = [
            event for event in issue["eventsList"]
            if not (event["user"] is None or event["ref_target"] is None)
        ]

        # sorts eventsList by time
        issue["eventsList"] = sorted(issue["eventsList"],
                                     key=lambda k: k["created_at"])

    return issue_data
def reformat_events(issue_data):
    """
    Re-format event information dependent on the event type.

    :param issue_data: the data of all issues that shall be re-formatted
    :return: the issue data with updated event information
    """

    log.info("Update event information ...")

    for issue in issue_data:

        # re-format information of every event in the eventsList of an issue
        for event in issue["eventsList"]:

            if event["event"] == "closed":
                event["event"] = "state_updated"
                event["event_info_1"] = "closed"  # new state
                event["event_info_2"] = "open"  # old state
                issue["state_new"] = "closed"

            elif event["event"] == "reopened":
                event["event"] = "state_updated"
                event["event_info_1"] = "open"  # new state
                event["event_info_2"] = "closed"  # old state
                issue["state_new"] = "reopened"

            elif event["event"] == "labeled":
                label = event["label"]["name"].lower()
                event["event_info_1"] = label

                # if the label is in this list, it also is a type of the issue
                if label in known_types:
                    issue["type"].append(str(label))

                    # creates an event for type updates and adds it to the eventsList
                    type_event = dict()
                    type_event["user"] = event["user"]
                    type_event["created_at"] = event["created_at"]
                    type_event["event"] = "type_updated"
                    type_event["event_info_1"] = label
                    type_event["event_info_2"] = ""
                    type_event["ref_target"] = ""
                    issue["eventsList"].append(type_event)

                # if the label is in this list, it also is a resolution of the issue
                elif label in known_resolutions:
                    issue["resolution"].append(str(label))

                    # creates an event for resolution updates and adds it to the eventsList
                    resolution_event = dict()
                    resolution_event["user"] = event["user"]
                    resolution_event["created_at"] = event["created_at"]
                    resolution_event["event"] = "resolution_updated"
                    resolution_event["event_info_1"] = label
                    resolution_event["event_info_2"] = ""
                    resolution_event["ref_target"] = ""
                    issue["eventsList"].append(resolution_event)

            elif event["event"] == "unlabeled":
                label = event["label"]["name"].lower()
                event["event_info_1"] = label

                # if the label is in this list, it also is a type of the issue
                if label in known_types:
                    issue["type"].remove(str(label))

                    # creates an event for type updates and adds it to the eventsList
                    type_event = dict()
                    type_event["user"] = event["user"]
                    type_event["created_at"] = event["created_at"]
                    type_event["event"] = "type_updated"
                    type_event["event_info_1"] = ""
                    type_event["event_info_2"] = label
                    type_event["ref_target"] = ""
                    issue["eventsList"].append(type_event)

                # if the label is in this list, it also is a resolution of the issue
                elif label in known_resolutions:
                    issue["resolution"].remove(str(label))

                    # creates an event for resolution updates and adds it to the eventsList
                    resolution_event = dict()
                    resolution_event["user"] = event["user"]
                    resolution_event["created_at"] = event["created_at"]
                    resolution_event["event"] = "resolution_updated"
                    resolution_event["event_info_1"] = ""
                    resolution_event["event_info_2"] = label
                    resolution_event["ref_target"] = ""
                    issue["eventsList"].append(resolution_event)

            elif event["event"] == "commented":
                # "state_new" and "resolution" of the issue give the information about the state and the resolution of
                # the issue when the comment was written, because the eventsList is sorted by time
                event["event_info_1"] = issue["state_new"]
                event["event_info_2"] = issue["resolution"]

    return issue_data
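
A minimal sketch of the "labeled" branch above, assuming "bug" is contained in the module-level known_types list; the user and timestamp are hypothetical:

issue = {"type": [], "resolution": [], "state_new": "open",
         "eventsList": [{"event": "labeled",
                         "label": {"name": "Bug"},
                         "user": {"name": "Jane Doe", "email": "jane@example.org"},
                         "created_at": "2020-01-01 12:00:00"}]}
reformat_events([issue])
# issue["type"] == ["bug"], and a synthetic "type_updated" event was appended
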
Example no. 25
def load_issue_via_api(issues, persons, url):
    """
    For each issue in the list, the history is added via the API.

    :param issues: list of issues
    :param persons: list of persons from JIRA (incl. e-mail addresses), see function "load_csv"
    :param url: the project url
    """

    log.info("Load issue information via api...")
    jira_project = JIRA(url)

    global jira_request_counter

    for issue in issues:

        # if the number of JIRA requests has reached the request limit, wait 24 hours
        if jira_request_counter > max_requests:
            log.info(
                "More than " + str(max_requests) +
                " JIRA requests have already been sent. Wait for 24 hours...")
            sleep(86500)  # 24 hours (60 * 60 * 24 = 86400 s) plus a small buffer
            log.info("Reset JIRA request counter and proceed...")
            jira_request_counter = 0

        try:
            # send JIRA request for current issues and increase request counter
            jira_request_counter += 1
            log.info("JIRA request counter: " + str(jira_request_counter))
            api_issue = jira_project.issue(issue["externalId"],
                                           expand="changelog")
            changelog = api_issue.changelog
        except JIRAError:
            log.warn("JIRA Error: Changelog cannot be extracted for issue " +
                     issue["externalId"] + ". History omitted!")
            changelog = None

        histories = list()

        # adds the issue creation time with the default state to a list
        # list is needed to find out the state the issue had when a comment was written
        state_changes = [[issue["creationDate"], "open"]]

        # adds the issue creation time with the default resolution to a list
        # list is needed to find out the resolution the issue had when a comment was written
        resolution_changes = [[issue["creationDate"], "unresolved"]]

        # only consider history changes if we were able to extract the changelog for the current issue
        if changelog is not None:

            # history changes get visited in time order from oldest to newest
            for change in changelog.histories:

                # default values for state and resolution
                old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved"

                # all changes in the issue changelog are checked whether they contain useful information
                for item in change.items:

                    # state_updated event gets created and added to the issue history
                    if item.field == "status":
                        if item.fromString is not None:
                            old_state = item.fromString.lower()
                        if item.toString is not None:
                            new_state = item.toString.lower()
                        history = dict()
                        history["event"] = "state_updated"
                        history["event_info_1"] = new_state
                        history["event_info_2"] = old_state
                        if hasattr(change, "author"):
                            user = create_user(change.author.displayName,
                                               change.author.name, "")
                        else:
                            log.warn("No author for history: " +
                                     str(change.id) + " created at " +
                                     str(change.created))
                            user = create_user("", "", "")
                        history["author"] = merge_user_with_user_from_csv(
                            user, persons)
                        history["date"] = format_time(change.created)
                        histories.append(history)
                        state_changes.append([history["date"], new_state])

                    # resolution_updated event gets created and added to the issue history
                    elif item.field == "resolution":
                        if item.fromString is not None:
                            old_resolution = item.fromString.lower()
                        if item.toString is not None:
                            new_resolution = item.toString.lower()
                        history = dict()
                        history["event"] = "resolution_updated"
                        history["event_info_1"] = new_resolution
                        history["event_info_2"] = old_resolution
                        if hasattr(change, "author"):
                            user = create_user(change.author.displayName,
                                               change.author.name, "")
                        else:
                            log.warn("No author for history: " +
                                     str(change.id) + " created at " +
                                     str(change.created))
                            user = create_user("", "", "")
                        history["author"] = merge_user_with_user_from_csv(
                            user, persons)
                        history["date"] = format_time(change.created)
                        histories.append(history)
                        resolution_changes.append(
                            [history["date"], new_resolution])

                    # assigned event gets created and added to the issue history
                    elif item.field == "assignee":
                        history = dict()
                        history["event"] = "assigned"
                        user = create_user(change.author.displayName,
                                           change.author.name, "")
                        history["author"] = merge_user_with_user_from_csv(
                            user, persons)
                        assignee = create_user(item.toString, item.to, "")
                        assigned_user = merge_user_with_user_from_csv(
                            assignee, persons)
                        history["event_info_1"] = assigned_user["name"]
                        history["event_info_2"] = assigned_user["email"]
                        history["date"] = format_time(change.created)
                        histories.append(history)

                    elif item.field == "Link":
                        # add_link event gets created and added to the issue history
                        if item.toString is not None:
                            history = dict()
                            history["event"] = "add_link"
                            user = create_user(change.author.displayName,
                                               change.author.name, "")
                            history["author"] = merge_user_with_user_from_csv(
                                user, persons)
                            # api returns a text. The issueId is at the end of the text and gets extracted
                            history["event_info_1"] = item.toString.split()[-1]
                            history["event_info_2"] = "issue"
                            history["date"] = format_time(change.created)
                            histories.append(history)

                        # remove_link event gets created and added to the issue history
                        if item.fromString is not None:
                            history = dict()
                            history["event"] = "remove_link"
                            user = create_user(change.author.displayName,
                                               change.author.name, "")
                            history["author"] = merge_user_with_user_from_csv(
                                user, persons)
                            # api returns a text. The issue id is at the end of the text and gets extracted
                            history["event_info_1"] = item.fromString.split()[-1]
                            history["event_info_2"] = "issue"
                            history["date"] = format_time(change.created)
                            histories.append(history)

        # state and resolution change lists get sorted by time
        state_changes.sort(key=lambda x: x[0])
        resolution_changes.sort(key=lambda x: x[0])

        for comment in issue["comments"]:

            # the state the issue had when the comment was written is looked up
            for state in state_changes:
                if comment["changeDate"] > state[0]:
                    comment["state_on_creation"] = state[1]

            # the resolution the issue had when the comment was written is looked up
            for resolution in resolution_changes:
                if comment["changeDate"] > resolution[0]:
                    comment["resolution_on_creation"] = [str(resolution[1])]

        issue["history"] = histories
def print_to_disk_bugs(issues, results_folder):
    """
    Selects the bug issues and prints them to file "bugs-jira.list" in the result folder.
    This method prints in a new format which is consistent with the format of "print_to_disk_new" in "issue_processing.py".
    TODO When the network library is updated this format shall be used in all print to disk methods.

    :param issues: the issues to filter for bugs
    :param results_folder: the folder where to place "bugs-jira.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "bugs-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # only write issues with type bug and their comments to the output file
        if "bug" in issue["type_new"]:
            lines.append((
                issue["externalId"],
                issue["title"],
                issue["type_new"],
                issue["state_new"],
                issue["resolution_new"],
                issue["creationDate"],
                issue["resolveDate"],
                issue["components"],
                "created",  ## event.name
                issue["author"]["name"],
                issue["author"]["email"],
                issue["creationDate"],
                "open",  ## default state when created
                ["unresolved"]  ## default resolution when created
            ))

            lines.append((
                issue["externalId"],
                issue["title"],
                issue["type_new"],
                issue["state_new"],
                issue["resolution_new"],
                issue["creationDate"],
                issue["resolveDate"],
                issue["components"],
                "commented",
                issue["author"]["name"],
                issue["author"]["email"],
                issue["creationDate"],
                "open",  ##  default state when created
                ["unresolved"]  ## default resolution when created
            ))

            for comment in issue["comments"]:
                lines.append((
                    issue["externalId"],
                    issue["title"],
                    issue["type_new"],
                    issue["state_new"],
                    issue["resolution_new"],
                    issue["creationDate"],
                    issue["resolveDate"],
                    issue["components"],
                    "commented",
                    comment["author"]["name"],
                    comment["author"]["email"],
                    comment["changeDate"],
                    comment["state_on_creation"],
                    comment["resolution_on_creation"]
                ))

            for history in issue["history"]:
                lines.append((
                    issue["externalId"],
                    issue["title"],
                    issue["type_new"],
                    issue["state_new"],
                    issue["resolution_new"],
                    issue["creationDate"],
                    issue["resolveDate"],
                    issue["components"],
                    history["event"],
                    history["author"]["name"],
                    history["author"]["email"],
                    history["date"],
                    history["event_info_1"],
                    history["event_info_2"]
                ))

    # write to output file
    csv_writer.write_to_csv(output_file, lines, append=True)
Example no. 27
def run():
    # get all needed paths and arguments for the method call.
    parser = argparse.ArgumentParser(prog="codeface-extraction-issues-jira",
                                     description="Codeface extraction")
    parser.add_argument("-c",
                        "--config",
                        help="Codeface configuration file",
                        default="codeface.conf")
    parser.add_argument("-p",
                        "--project",
                        help="Project configuration file",
                        required=True)
    parser.add_argument("resdir",
                        help="Directory to store analysis results in")
    parser.add_argument(
        "-s",
        "--skip_history",
        help="Skip methods that retrieve additional history information from the "
             "configured JIRA server. This decreases the runtime and shuts off "
             "the external connection",
        action="store_true")

    # parse arguments
    args = parser.parse_args(sys.argv[1:])
    __codeface_conf, __project_conf = map(os.path.abspath,
                                          (args.config, args.project))

    # create configuration
    __conf = Configuration.load(__codeface_conf, __project_conf)

    # get source and results folders
    __srcdir = os.path.abspath(
        os.path.join(args.resdir, __conf["repo"] + "_proximity", "conway",
                     "issues_xml"))
    __resdir = os.path.abspath(
        os.path.join(args.resdir, __conf["project"], __conf["tagging"]))
    __srcdir_csv = os.path.abspath(
        os.path.join(args.resdir, __conf["repo"] + "_proximity", "conway"))

    # get person folder
    # __psrcdir = os.path.abspath(os.path.join(args.resdir, __conf["repo"] + "_proximity", "conway"))

    # load the list of persons
    persons = load_csv(__srcdir_csv)

    # load the xml-file list
    file_list = [
        f for f in os.listdir(__srcdir)
        if os.path.isfile(os.path.join(__srcdir, f))
    ]

    # creates empty result files
    clear_result_files(__resdir)

    # list for malformed or missing xml-files
    incorrect_files = []

    # processes every xml-file
    for current_file in file_list:
        # 1) load the list of issues
        issues = load_xml(__srcdir, current_file)
        # if an error occurred while loading the xml-file
        if issues is None:
            incorrect_files.append(current_file)
            continue
        # 2) re-format the issues
        issues = parse_xml(issues, persons, args.skip_history)
        # 3) load issue information via api
        if not args.skip_history:
            load_issue_via_api(issues, persons, __conf["issueTrackerURL"])
        # 4) update user data with Codeface database
        #    ATTENTION: As the database update is performed for every iteration in this for loop, but the current issue
        #    data is appended to the results file immediately, the database updates from the later iterations are not
        #    respected in the previously dumped issues from the previous iterations. However, as we don't get email
        #    data from JIRA, this is currently not a problem, as no names will change in the database if we don't
        #    provide emails. If JIRA will provide email data in the future, this implementation needs to be adjusted
        #    in such a way that users in issue data of all iterations are updated in the end and dumped afterwards,
        #    instead of dumping the intermediate issue data immediately.
        issues = insert_user_data(issues, __conf)
        # 5) dump result to disk
        print_to_disk(issues, __resdir)
        # # 6) export for Gephi
        # print_to_disk_gephi(issues, __resdir)
        # # 7) export for jira issue extraction to use them in dev-network-growth
        # print_to_disk_extr(issues, __resdir)
        # 8) dump bug issues to disk
        print_to_disk_bugs(issues, __resdir)

    log.info("Jira issue processing complete!")
    log.info("In total, " + str(jira_request_counter) +
             " requests have been sent to Jira.")

    if incorrect_files:
        log.info("Following files where malformed or not existing:: " +
                 str(incorrect_files))
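
# Hedged illustration of how the parser defined in run() behaves; the project
# configuration file and result directory below are invented for the demo.
import argparse

demo_parser = argparse.ArgumentParser(prog="codeface-extraction-issues-jira")
demo_parser.add_argument("-c", "--config", default="codeface.conf")
demo_parser.add_argument("-p", "--project", required=True)
demo_parser.add_argument("resdir")
demo_parser.add_argument("-s", "--skip_history", action="store_true")

demo_args = demo_parser.parse_args(["-p", "busybox.conf", "/tmp/results", "-s"])
print(demo_args.config)        # "codeface.conf" (the default is kept)
print(demo_args.skip_history)  # True, so the JIRA server is not contacted
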
def run():
    # get all needed paths and arguments for the method call.
    parser = argparse.ArgumentParser(prog="codeface-extraction-issues-jira", description="Codeface extraction")
    parser.add_argument("-c", "--config", help="Codeface configuration file", default="codeface.conf")
    parser.add_argument("-p", "--project", help="Project configuration file", required=True)
    parser.add_argument("resdir", help="Directory to store analysis results in")
    parser.add_argument("-s", "--skip_history",
                        help="Skip methods that retrieve additional history information from the configured JIRA" +
                             "server. This decreases the runtime and shuts off the external connection",
                        action="store_true")

    # parse arguments
    args = parser.parse_args(sys.argv[1:])
    __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project))

    # create configuration
    __conf = Configuration.load(__codeface_conf, __project_conf)

    # get source and results folders
    __srcdir = os.path.abspath(os.path.join(args.resdir, __conf["repo"] + "_proximity", "conway", "issues_xml"))
    __resdir = os.path.abspath(os.path.join(args.resdir, __conf["project"], __conf["tagging"]))
    __srcdir_csv = os.path.abspath(os.path.join(args.resdir, __conf["repo"] + "_proximity", "conway"))

    # get person folder
    # __psrcdir = os.path.abspath(os.path.join(args.resdir, __conf["repo"] + "_proximity", "conway"))

    # load the list of persons
    persons = load_csv(__srcdir_csv)

    # load the xml-file list
    file_list = [f for f in os.listdir(__srcdir) if os.path.isfile(os.path.join(__srcdir, f))]

    # creates empty result files
    clear_result_files(__resdir)

    # list for malformed or missing xml-files
    incorrect_files = []

    # processes every xml-file
    for current_file in file_list:
        # 1) load the list of issues
        issues = load_xml(__srcdir, current_file)
        # if an error occurred while loading the xml-file
        if issues is None:
            incorrect_files.append(current_file)
            continue
        # 2) re-format the issues
        issues = parse_xml(issues, persons, args.skip_history)
        # 3) load issue information via api
        if not args.skip_history:
            load_issue_via_api(issues, persons, __conf["issueTrackerURL"])
        # 4) update user data with Codeface database
        # maybe not necessary
        issues = insert_user_data(issues, __conf)
        # 5) dump result to disk
        print_to_disk(issues, __resdir)
        # 6) export for Gephi
        print_to_disk_gephi(issues, __resdir)
        # 7) export for jira issue extraction to use them in dev-network-growth
        print_to_disk_extr(issues, __resdir)
        # 8) dump bug issues to disk
        print_to_disk_bugs(issues, __resdir)

    log.info("Jira issue processing complete!")

    if incorrect_files:
        log.info("Following files where malformed or not existing:: " + str(incorrect_files))
def insert_user_data(issues, conf):
    """
    Insert user data into database and update issue data.

    :param issues: the issues to retrieve user data from
    :param conf: the project configuration
    :return: the updated issue data
    """

    log.info("Syncing users with ID service...")

    # create buffer for users
    user_buffer = dict()
    # open database connection
    dbm = DBManager(conf)
    # open ID-service connection
    idservice = idManager(dbm, conf)

    def get_user_string(name, email):
        if not email:
            return "{name}".format(name=name)
            # return "{name} <{name}@default.com>".format(name=name)  # for debugging only
        else:
            return "{name} <{email}>".format(name=name, email=email)

    def get_or_update_user(user, buffer_db=user_buffer):
        # fix encoding for name and e-mail address
        if user["name"] is not None:
            name = unicode(user["name"]).encode("utf-8")
        else:
            name = unicode(user["username"]).encode("utf-8")
        mail = unicode(user["email"]).encode("utf-8")
        # construct string for ID service and send query
        user_string = get_user_string(name, mail)

        # check buffer to reduce amount of DB queries
        if user_string in buffer_db:
            log.devinfo("Returning user '{}' from buffer.".format(user_string))
            return buffer_db[user_string]

        # get person information from ID service
        log.devinfo("Passing user '{}' to ID service.".format(user_string))
        idx = idservice.getPersonID(user_string)

        # update user data with person information from DB
        person = idservice.getPersonFromDB(idx)
        user["email"] = person["email1"]  # column "email1"
        user["name"] = person["name"]  # column "name"
        user["id"] = person["id"]  # column "id"

        # add user information to buffer
        # user_string = get_user_string(user["name"], user["email"]) # update for
        buffer_db[user_string] = user

        return user

    for issue in issues:
        # check database for issue author
        issue["user"] = get_or_update_user(issue["user"])

        # check database for event authors
        for event in issue["eventsList"]:
            # get the event user from the DB
            event["user"] = get_or_update_user(event["user"])

            # get the reference-target user from the DB if needed
            if event["ref_target"] != "":
                event["ref_target"] = get_or_update_user(event["ref_target"])
                event["event_info_1"] = event["ref_target"]["name"]
                event["event_info_2"] = event["ref_target"]["email"]

    return issues
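
# The buffering idea used above, in isolation: memoize expensive ID-service
# lookups keyed by the formatted user string. The lookup function here is an
# invented stand-in, not the real idservice.
def make_cached_lookup(lookup):
    cache = {}
    def cached(user_string):
        if user_string in cache:
            return cache[user_string]   # buffered, no DB query needed
        cache[user_string] = lookup(user_string)
        return cache[user_string]
    return cached

fake_id_service = make_cached_lookup(lambda s: {"id": len(s)})  # fake lookup
fake_id_service("Jane Doe <jane@example.org>")  # queries the (fake) service
fake_id_service("Jane Doe <jane@example.org>")  # answered from the buffer
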
def insert_user_data(issues, conf):
    """Insert user data into database ad update issue data.

    :param issues: the issues to retrieve user data from
    :param conf: the project configuration
    :return: the updated issue data
    """

    log.info("Syncing users with ID service...")

    # create buffer for users
    user_buffer = dict()
    # open database connection
    dbm = DBManager(conf)
    # open ID-service connection
    idservice = idManager(dbm, conf)

    def get_user_string(name, email):
        if not email:
            return "{name}".format(name=name)
            # return "{name} <{name}@default.com>".format(name=name)  # for debugging only
        else:
            return "{name} <{email}>".format(name=name, email=email)

    def get_or_update_user(user, buffer_db=user_buffer):
        # fix encoding for name and e-mail address
        if user["name"] is not None:
            name = unicode(user["name"]).encode("utf-8")
        else:
            name = unicode(user["username"]).encode("utf-8")
        mail = unicode(user["email"]).encode("utf-8")
        # construct string for ID service and send query
        user_string = get_user_string(name, mail)

        # check buffer to reduce amount of DB queries
        if user_string in buffer_db:
            log.devinfo("Returning user '{}' from buffer.".format(user_string))
            return buffer_db[user_string]

        # get person information from ID service
        log.devinfo("Passing user '{}' to ID service.".format(user_string))
        idx = idservice.getPersonID(user_string)

        # update user data with person information from DB
        person = idservice.getPersonFromDB(idx)
        user["email"] = person["email1"]  # column 'email1'
        user["name"] = person["name"]  # column 'name'
        user["id"] = person["id"]  # column 'id'

        # add user information to buffer
        # user_string = get_user_string(user["name"], user["email"]) # update for
        buffer_db[user_string] = user

        return user

    for issue in issues:
        # check database for issue author
        issue["user"] = get_or_update_user(issue["user"])

        # check database for event authors
        for event in issue["eventsList"]:
            # get the event user from the DB
            event["user"] = get_or_update_user(event["user"])
            # get the reference-target user from the DB if needed
            if event["ref_target"] != "":
                event["ref_target"] = get_or_update_user(event["ref_target"])

    return issues
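
# The user-string format the ID service is queried with, as used by both
# variants above: "name <email>" when an e-mail address is known, bare name
# otherwise. The example values are invented.
def get_user_string_demo(name, email):
    if not email:
        return "{name}".format(name=name)
    return "{name} <{email}>".format(name=name, email=email)

print(get_user_string_demo("Jane Doe", "jane@example.org"))  # Jane Doe <jane@example.org>
print(get_user_string_demo("Jane Doe", ""))                  # Jane Doe
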
Exemplo n.º 31
0
def insert_user_data(issues, conf):
    """
    Insert user data into database and update issue data.

    :param issues: the issues to retrieve user data from
    :param conf: the project configuration
    :return: the updated issue data
    """

    log.info("Syncing users with ID service...")

    # create buffer for users (key: user id)
    user_buffer = dict()
    # create buffer for user ids (key: user string)
    user_id_buffer = dict()
    # open database connection
    dbm = DBManager(conf)
    # open ID-service connection
    idservice = idManager(dbm, conf)

    def get_user_string(name, email):
        if not email:
            return "{name}".format(name=name)
            # return "{name} <{name}@default.com>".format(name=name)  # for debugging only
        else:
            return "{name} <{email}>".format(name=name, email=email)

    def get_id_and_update_user(user, buffer_db_ids=user_id_buffer):
        # fix encoding for name and e-mail address
        if user["name"] is not None and user["name"] != "":
            name = unicode(user["name"]).encode("utf-8")
        else:
            name = unicode(user["username"]).encode("utf-8")
        mail = unicode(user["email"]).encode("utf-8")  # empty
        # construct string for ID service and send query
        user_string = get_user_string(name, mail)

        # check buffer to reduce amount of DB queries
        if user_string in buffer_db_ids:
            log.devinfo(
                "Returning person id for user '{}' from buffer.".format(
                    user_string))
            return buffer_db_ids[user_string]

        # get person information from ID service
        log.devinfo("Passing user '{}' to ID service.".format(user_string))
        idx = idservice.getPersonID(user_string)

        # add user information to buffer
        # user_string = get_user_string(user["name"], user["email"]) # update for
        buffer_db_ids[user_string] = idx

        return idx

    def get_user_from_id(idx, buffer_db=user_buffer):

        # check whether user information is in buffer to reduce amount of DB queries
        if idx in buffer_db:
            log.devinfo("Returning user '{}' from buffer.".format(idx))
            return buffer_db[idx]

        # get person information from ID service
        log.devinfo("Passing user id '{}' to ID service.".format(idx))
        person = idservice.getPersonFromDB(idx)
        user = dict()
        user["email"] = person["email1"]  # column "email1"
        user["name"] = person["name"]  # column "name"
        user["id"] = person["id"]  # column "id"

        # add user information to buffer
        buffer_db[idx] = user

        return user

    # check and update database for all occurring users
    for issue in issues:
        # check database for issue author
        issue["author"] = get_id_and_update_user(issue["author"])

        # check database for comment authors
        for comment in issue["comments"]:
            comment["author"] = get_id_and_update_user(comment["author"])

        # check database for event authors in the history
        for event in issue["history"]:
            event["author"] = get_id_and_update_user(event["author"])

            # check database for target user if needed
            if event["event"] == "assigned":
                assigned_user = get_id_and_update_user(
                    create_user(event["event_info_1"], "",
                                event["event_info_2"]))
                event["event_info_1"] = assigned_user

    # get all users after database updates having been performed
    for issue in issues:
        # get issue author
        issue["author"] = get_user_from_id(issue["author"])

        # get comment authors
        for comment in issue["comments"]:
            comment["author"] = get_user_from_id(comment["author"])

        # get event authors for non-comment events
        for event in issue["history"]:
            event["author"] = get_user_from_id(event["author"])

            # get target user if needed
            if event["event"] == "assigned":
                assigned_user = get_user_from_id(event["event_info_1"])
                event["event_info_1"] = assigned_user["name"]
                event["event_info_2"] = assigned_user["email"]

    log.debug("number of issues after insert_user_data: '{}'".format(
        len(issues)))
    return issues
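
# A toy version of the two-pass resolution above (all names invented): pass 1
# resolves every author to a person id while database rows may still be
# updated; pass 2 reads each settled record exactly once.
id_by_string = {}   # pass-1 buffer: user string -> person id
record_by_id = {}   # pass-2 buffer: person id -> user record

def resolve_id(user_string):
    if user_string not in id_by_string:
        id_by_string[user_string] = len(id_by_string) + 1  # fake getPersonID
    return id_by_string[user_string]

def fetch_record(idx):
    if idx not in record_by_id:
        record_by_id[idx] = {"id": idx, "name": "person-%d" % idx,
                             "email": "mail-%d@example.org" % idx}  # fake getPersonFromDB
    return record_by_id[idx]

author_ids = [resolve_id(a) for a in ["alice", "bob", "alice"]]  # pass 1
authors = [fetch_record(i) for i in author_ids]                  # pass 2
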
Exemplo n.º 32
0
def print_to_disk(issues, results_folder):
    """
    Print issues to file "issues-jira.list" in result folder.

    :param issues: the issues to dump
    :param results_folder: the folder where to place "issues-jira.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "issues-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # add the creation event
        lines.append((
            issue["externalId"],
            issue["title"],
            json.dumps(issue["type_list"]),
            issue["state_new"],
            json.dumps(issue["resolution_list"]),
            issue["creationDate"],
            issue["resolveDate"],
            json.dumps(issue["components"]),
            "created",  ## event.name
            issue["author"]["name"],
            issue["author"]["email"],
            issue["creationDate"],
            "open",  ## default state when created
            json.dumps(["unresolved"])  ## default resolution when created
        ))

        # add an additional commented event for the creation
        lines.append((
            issue["externalId"],
            issue["title"],
            json.dumps(issue["type_list"]),
            issue["state_new"],
            json.dumps(issue["resolution_list"]),
            issue["creationDate"],
            issue["resolveDate"],
            json.dumps(issue["components"]),
            "commented",
            issue["author"]["name"],
            issue["author"]["email"],
            issue["creationDate"],
            "open",  ##  default state when created
            json.dumps(["unresolved"])  ## default resolution when created
        ))

        # add comment events
        for comment in issue["comments"]:
            lines.append(
                (issue["externalId"], issue["title"],
                 json.dumps(issue["type_list"]), issue["state_new"],
                 json.dumps(issue["resolution_list"]),
                 issue["creationDate"], issue["resolveDate"],
                 json.dumps(issue["components"]), "commented",
                 comment["author"]["name"], comment["author"]["email"],
                 comment["changeDate"], comment["state_on_creation"],
                 json.dumps(comment["resolution_on_creation"])))

        # add history events
        for history in issue["history"]:
            lines.append(
                (issue["externalId"], issue["title"],
                 json.dumps(issue["type_list"]), issue["state_new"],
                 json.dumps(issue["resolution_list"]),
                 issue["creationDate"], issue["resolveDate"],
                 json.dumps(issue["components"]), history["event"],
                 history["author"]["name"], history["author"]["email"],
                 history["date"], history["event_info_1"],
                 json.dumps(history["event_info_2"])))

    # write to output file
    csv_writer.write_to_csv(output_file, lines, append=True)
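
# Why list-valued columns are passed through json.dumps above: the cell then
# holds one parseable JSON string instead of a Python repr. Values invented.
import json

type_list = ["bug", "feature"]
cell = json.dumps(type_list)          # '["bug", "feature"]' -- a single CSV cell
assert json.loads(cell) == type_list  # and it round-trips losslessly
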
Exemplo n.º 33
0
def print_to_disk_bugs(issues, results_folder):
    """
    Extract bug issues and print them to file "bugs-jira.list" in the results folder.
    This method prints in a format which is consistent with the format of "print_to_disk" in "issue_processing.py".

    :param issues: the issues to filter for bugs
    :param results_folder: the folder where to place "bugs-jira.list" output file
    """

    # construct path to output file
    output_file = os.path.join(results_folder, "bugs-jira.list")
    log.info("Dumping output in file '{}'...".format(output_file))

    # construct lines of output
    lines = []
    for issue in issues:
        log.info("Current issue '{}'".format(issue["externalId"]))

        # only write issues with type bug and their comments in the output file
        if "bug" in issue["type_list"]:

            # add the creation event
            lines.append((
                issue["externalId"],
                issue["title"],
                json.dumps(issue["type_list"]),
                issue["state_new"],
                json.dumps(issue["resolution_list"]),
                issue["creationDate"],
                issue["resolveDate"],
                json.dumps(issue["components"]),
                "created",  ## event.name
                issue["author"]["name"],
                issue["author"]["email"],
                issue["creationDate"],
                "open",  ## default state when created
                json.dumps(["unresolved"])  ## default resolution when created
            ))

            # add an additional commented event for the creation
            lines.append((
                issue["externalId"],
                issue["title"],
                json.dumps(issue["type_list"]),
                issue["state_new"],
                json.dumps(issue["resolution_list"]),
                issue["creationDate"],
                issue["resolveDate"],
                json.dumps(issue["components"]),
                "commented",
                issue["author"]["name"],
                issue["author"]["email"],
                issue["creationDate"],
                "open",  ##  default state when created
                json.dumps(["unresolved"])  ## default resolution when created
            ))

            # add comment events
            for comment in issue["comments"]:
                lines.append(
                    (issue["externalId"], issue["title"],
                     json.dumps(issue["type_list"]), issue["state_new"],
                     json.dumps(issue["resolution_list"]),
                     issue["creationDate"], issue["resolveDate"],
                     json.dumps(issue["components"]), "commented",
                     comment["author"]["name"], comment["author"]["email"],
                     comment["changeDate"], comment["state_on_creation"],
                     json.dumps(comment["resolution_on_creation"])))

            # add history events
            for history in issue["history"]:
                lines.append(
                    (issue["externalId"], issue["title"],
                     json.dumps(issue["type_list"]), issue["state_new"],
                     json.dumps(issue["resolution_list"]),
                     issue["creationDate"], issue["resolveDate"],
                     json.dumps(issue["components"]), history["event"],
                     history["author"]["name"], history["author"]["email"],
                     history["date"], history["event_info_1"],
                     json.dumps(history["event_info_2"])))

    # write to output file
    csv_writer.write_to_csv(output_file, lines, append=True)
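
# The bug filter above in isolation: an issue contributes rows to
# "bugs-jira.list" only if "bug" occurs in its type list. Data is invented.
issues_demo = [{"externalId": "PROJ-1", "type_list": ["bug"]},
               {"externalId": "PROJ-2", "type_list": ["feature"]}]
bug_ids = [i["externalId"] for i in issues_demo if "bug" in i["type_list"]]
print(bug_ids)  # ['PROJ-1']
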
Exemplo n.º 34
0
def run_anonymization(conf, resdir):
    """
    Runs the anonymization process for the given parameters, that is, replaces names, e-mail addresses, message ids,
    and issue titles with pseudonymized contents in all .list files in resdir.
    Writes the anonymized .list files to another directory (resdir + "_anonymized").

    :param conf: the Codeface configuration object
    :param resdir: the Codeface results dir, where result files are read from
    """

    authors_list = "authors.list"
    commits_list = "commits.list"
    emails_list = "emails.list"
    issues_github_list = "issues-github.list"
    issues_jira_list = "issues-jira.list"
    bugs_jira_list = "bugs-jira.list"
    bots_list = "bots.list"
    gender_list = "gender.list"
    revisions_list = "revisions.list"  # not to be anonymized, only to be copied to the "anonymized" directory

    # When looking at elements originating from json lists, we need to consider quotation marks around the string
    quot_m = "\""

    data_path = path.join(resdir, conf["project"], conf["tagging"])
    anonymize_path = path.join((resdir + "_anonymized"), conf["project"],
                               conf["tagging"])
    if not path.exists(anonymize_path):
        log.info("Create directory %s", anonymize_path)
        makedirs(anonymize_path)

    log.info("%s: Anonymize authors." % conf["project"])

    # create dictionaries to store mappings from authors to anonymized authors and titles to anonymized titles
    author_to_anonymized_author = dict()
    author_to_anonymized_author_gender = dict()
    i = 0
    i_gender = 0
    title_to_anonymized_title = dict()
    k = 0
    """
    Helper function to anonymize author data (i.e., data from the authors.list file).

    :param author_data: the author data to be anonymized (must have been read via "csv_writer.read_from_csv")
    :param i: counter for anonymized developer names (i.e., its current start value which has not been used yet)
    :param author_to_anonymized_author: dictionary in which to lookup and store mappings from (name, e-mail) pairs
                                        to anonymized (name, e-mail) pairs for the developers
    :param name_only: whether also the name (without e-mail) should be used as key for the dictionary
                      "author_to_anonymized_author". This is necessary if there might be lookups using
                      auto-generated and, therefore, different e-mail addresses for the same name.
    :return: the anonymized "author_data",
             the current value of "i" (which has not been used yet),
             and the updated dictionary "author_to_anonymized_author"
    """
    def anonymize_authors(author_data,
                          i,
                          author_to_anonymized_author,
                          name_only=False):

        for author in author_data:
            orig_author = author[1]
            orig_email = author[2]

            # Don't anonymize the deleted user as this one might be needed for filtering (but add it to the dictionary)
            if orig_author == "Deleted user" and orig_email == "*****@*****.**":
                if not (orig_author,
                        orig_email) in author_to_anonymized_author:
                    author_to_anonymized_author[(orig_author,
                                                 orig_email)] = (orig_author,
                                                                 orig_email)
            else:
                # check whether (name, e-mail) pair isn't already present in the dictionary
                if not (orig_author,
                        orig_email) in author_to_anonymized_author:
                    # check if just the name (without e-mail address) isn't already present in the dictionary
                    if not orig_author in author_to_anonymized_author:
                        # if the author has an empty name, only anonymize their e-mail address
                        if not author[1] == "":
                            author[1] = ("developer" + str(i))
                        author[2] = ("mail" + str(i) + "@dev.org")

                        # add new entry to dictionary (using (name, e-mail) pair as key)
                        author_to_anonymized_author[(orig_author,
                                                     orig_email)] = (author[1],
                                                                     author[2])
                        # if we allow name-only entries, also add an additional entry to dictionary
                        if name_only:
                            author_to_anonymized_author[orig_author] = (
                                author[1], author[2])

                        # increment counter as we have generated a new anonymized developer id
                        i += 1
                    else:
                        # as just the name (without e-mail address) is present in the dictionary, make a lookup
                        # for the name only and add a new entry to the dictionary using (name, e-mail) pair
                        author_new = author_to_anonymized_author[orig_author]
                        author_to_anonymized_author[(
                            orig_author, orig_email)] = (author_new[0],
                                                         author_new[1])
                        author[1] = author_new[0]
                        author[2] = author_new[1]
                else:
                    # as the (name, e-mail) pair is present in the dictionary, just make a lookup for the pair
                    author_new = author_to_anonymized_author[(orig_author,
                                                              orig_email)]
                    author[1] = author_new[0]
                    author[2] = author_new[1]

        return author_data, i, author_to_anonymized_author

    # Check for all files in the result directory of the project whether they need to be anonymized
    for filepath, dirnames, filenames in walk(data_path):

        # (1) Anonymize authors lists
        if authors_list in filenames:
            f = path.join(filepath, authors_list)
            log.info("Anonymize %s ...", f)
            author_data = csv_writer.read_from_csv(f)
            author_data_gender = csv_writer.read_from_csv(f)

            # check if tagging is "feature"
            if conf["tagging"] == "feature":
                # as tagging is "feature", we need to check for the proximity data to keep anonymized ids consistent
                # over both feature and proximity data

                # if corresponding proximity data exists, read authors from proximity data and use them for
                # anonymization to make anonymized proximity data and feature data consistent
                f_proximity = f.replace("feature", "proximity")
                if path.isfile(f_proximity):
                    log.info(
                        "Read authors from %s and anonymize them (without dumping to file).",
                        f_proximity)
                    author_data_proximity = csv_writer.read_from_csv(
                        f_proximity)

                    # anonymize authors from proximity data (but just add them to our dictionary, to be used below
                    # for the actual anonymization of the feature data)
                    author_data_proximity, i, author_to_anonymized_author = \
                      anonymize_authors(author_data_proximity, i, author_to_anonymized_author, name_only = True)

            # anonymize authors
            author_data, i, author_to_anonymized_author = \
              anonymize_authors(author_data, i, author_to_anonymized_author)

            author_data_gender, i_gender, author_to_anonymized_author_gender = \
              anonymize_authors(author_data_gender, i_gender, author_to_anonymized_author_gender, name_only = True)

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, author_data)

        # (2) Anonymize commits lists
        if commits_list in filenames:
            f = path.join(filepath, commits_list)
            log.info("Anonymize %s ...", f)
            commit_data = csv_writer.read_from_csv(f)

            for commit in commit_data:
                # anonymize author
                new_author = author_to_anonymized_author[(commit[2],
                                                          commit[3])]
                commit[2] = new_author[0]
                commit[3] = new_author[1]
                # anonymize committer
                new_committer = author_to_anonymized_author[(commit[5],
                                                             commit[6])]
                commit[5] = new_committer[0]
                commit[6] = new_committer[1]

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, commit_data)

        # (3) Anonymize emails lists
        if emails_list in filenames:
            f = path.join(filepath, emails_list)
            log.info("Anonymize %s ...", f)
            email_data = csv_writer.read_from_csv(f)

            j = 0

            for email in email_data:
                # anonymize author
                new_author = author_to_anonymized_author[(email[0], email[1])]
                email[0] = new_author[0]
                email[1] = new_author[1]
                # anonymize message id
                email[2] = ("<message" + str(j) + "@message.dev.org>")
                j += 1

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, email_data)

        # (4) Anonymize issues lists (github)
        if issues_github_list in filenames:
            f = path.join(filepath, issues_github_list)
            log.info("Anonymize %s ...", f)
            issue_data = csv_writer.read_from_csv(f)

            for issue_event in issue_data:
                # anonymize author
                new_author = author_to_anonymized_author[(issue_event[9],
                                                          issue_event[10])]
                issue_event[9] = new_author[0]
                issue_event[10] = new_author[1]
                # anonymize person in event info 1/2
                if (issue_event[12],
                        issue_event[13][1:-1]) in author_to_anonymized_author:
                    new_person = author_to_anonymized_author[(
                        issue_event[12], issue_event[13][1:-1])]
                    issue_event[12] = new_person[0]
                    issue_event[13] = quot_m + new_person[1] + quot_m
                # anonymize issue title
                if issue_event[1] in title_to_anonymized_title:
                    issue_event[1] = title_to_anonymized_title[issue_event[1]]
                else:
                    new_title = ("issue-title-" + str(k))
                    title_to_anonymized_title[issue_event[1]] = new_title
                    issue_event[1] = new_title
                    k += 1

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, issue_data)

        # (5) Anonymize issues lists (jira)
        if issues_jira_list in filenames:
            f = path.join(filepath, issues_jira_list)
            log.info("Anonymize %s ...", f)
            issue_data = csv_writer.read_from_csv(f)

            for issue_event in issue_data:
                # anonymize author
                new_author = author_to_anonymized_author[(issue_event[9],
                                                          issue_event[10])]
                issue_event[9] = new_author[0]
                issue_event[10] = new_author[1]
                # anonymize person in event info 1/2
                if (issue_event[12],
                        issue_event[13][1:-1]) in author_to_anonymized_author:
                    new_person = author_to_anonymized_author[(
                        issue_event[12], issue_event[13][1:-1])]
                    issue_event[12] = new_person[0]
                    issue_event[13] = quot_m + new_person[1] + quot_m
                # anonymize issue title
                if issue_event[1] in title_to_anonymized_title:
                    issue_event[1] = title_to_anonymized_title[issue_event[1]]
                else:
                    new_title = ("issue-title-" + str(k))
                    title_to_anonymized_title[issue_event[1]] = new_title
                    issue_event[1] = new_title
                    k += 1

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, issue_data)

        # (6) Anonymize bugs lists (jira)
        if bugs_jira_list in filenames:
            f = path.join(filepath, bugs_jira_list)
            log.info("Anonymize %s ...", f)
            bug_data = csv_writer.read_from_csv(f)

            for bug_event in bug_data:
                # anonymize author
                new_author = author_to_anonymized_author[(bug_event[9],
                                                          bug_event[10])]
                bug_event[9] = new_author[0]
                bug_event[10] = new_author[1]
                # anonymize person in event info 1/2
                if (bug_event[12],
                        bug_event[13][1:-1]) in author_to_anonymized_author:
                    new_person = author_to_anonymized_author[(
                        bug_event[12], bug_event[13][1:-1])]
                    bug_event[12] = new_person[0]
                    bug_event[13] = quot_m + new_person[1] + quot_m
                # anonymize bug title
                if bug_event[1] in title_to_anonymized_title:
                    bug_event[1] = title_to_anonymized_title[bug_event[1]]
                else:
                    new_title = ("issue-title-" + str(k))
                    title_to_anonymized_title[bug_event[1]] = new_title
                    bug_event[1] = new_title
                    k += 1

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, bug_data)

        # (7) Anonymize bots list
        if bots_list in filenames:
            f = path.join(filepath, bots_list)
            log.info("Anonymize %s ...", f)
            bot_data = csv_writer.read_from_csv(f)

            for bot in bot_data:
                new_person = author_to_anonymized_author[(bot[0], bot[1])]
                bot[0] = new_person[0]
                bot[1] = new_person[1]

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, bot_data)

        # (8) Anonymize gender list
        if gender_list in filenames:
            f = path.join(filepath, gender_list)
            log.info("Anonymize %s ...", f)
            gender_data = csv_writer.read_from_csv(f)
            gender_data_new = []

            for author in gender_data:
                if author[0] in author_to_anonymized_author_gender.keys():
                    new_person = author_to_anonymized_author_gender[author[0]]
                    author[0] = new_person[0]
                    gender_data_new.append(author)

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Write anonymized data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, gender_data_new)

        # (9) Copy revisions list
        if revisions_list in filenames:
            f = path.join(filepath, revisions_list)
            log.info("Copy %s ...", f)
            revision_data = csv_writer.read_from_csv(f)

            output_path = f.replace(data_path, anonymize_path)
            if not path.exists(path.dirname(output_path)):
                makedirs(path.dirname(output_path))
            log.info("Copy revision data to %s ...", output_path)
            csv_writer.write_to_csv(output_path, revision_data)

    log.info("Anonymization complete!")
def merge_issue_events(issue_data):
    """
    All issue events are merged together in the eventsList. This simplifies processing in later steps.

    :param issue_data: the issue data from which the events shall be merged
    :return: the issue data with merged eventsList
    """

    log.info("Merge issue events ...")

    for issue in issue_data:

        # temporary container for references
        comments = dict()

        # adds creation event to eventsList
        created_event = dict()
        created_event["user"] = issue["user"]
        created_event["created_at"] = issue["created_at"]
        created_event["event"] = "created"
        created_event["event_info_1"] = "open"
        created_event["event_info_2"] = []
        issue["eventsList"].append(created_event)
        issue["state_new"] = "open"

        # the format of every related issue is adjusted to the event format
        for rel_issue in issue["relatedIssues"]:
            rel_issue["created_at"] = format_time(rel_issue["referenced_at"])
            rel_issue["event"] = "add_link"
            rel_issue["event_info_1"] = rel_issue["number"]
            rel_issue["event_info_2"] = "issue"
            rel_issue["ref_target"] = ""

        # the format of every related commit is adjusted to the event format
        for rel_commit in issue["relatedCommits"]:

            # if the related commit has no time, it is a commit in the pull-request
            if rel_commit["referenced_at"] is None:
                rel_commit["user"] = create_user("", "", "")
                rel_commit["created_at"] = ""
                rel_commit["event"] = "has_commit"
                rel_commit["event_info_1"] = rel_commit["commit_id"]
                rel_commit["event_info_2"] = ""
                rel_commit["ref_target"] = ""
            # else it is a commit the issue/pull-request refers to
            else:
                rel_commit["created_at"] = format_time(rel_commit["referenced_at"])
                rel_commit["event"] = "add_link"
                rel_commit["event_info_1"] = rel_commit["commit_id"]
                rel_commit["event_info_2"] = "commit"
                rel_commit["ref_target"] = ""

        # the format of every comment is adjusted to the event format
        for comment in issue["commentsList"]:
            comment["event"] = "commented"
            comment["ref_target"] = ""
            comment["created_at"] = format_time(comment["referenced_at"])
            if "event_info_1" not in comment:
                comment["event_info_1"] = ""
            if "event_info_2" not in comment:
                comment["event_info_2"] = ""

            # cache comment by date to resolve/re-arrange references later
            comments[comment["created_at"]] = comment

        # the format of every event is adjusted
        for event in issue["eventsList"]:
            event["ref_target"] = ""
            event["created_at"] = format_time(event["created_at"])
            if "event_info_1" not in event:
                event["event_info_1"] = ""
            if "event_info_2" not in event:
                event["event_info_2"] = ""

            # if event collides with a comment
            if event["created_at"] in comments:
                comment = comments[event["created_at"]]
                # if someone gets mentioned or subscribed by someone else in a comment,
                # re-write the reference
                if (event["event"] == "mentioned" or event["event"] == "subscribed") and \
                                comment["event"] == "commented":
                    event["ref_target"] = event["user"]
                    event["user"] = comment["user"]

        # merge events, relatedCommits, relatedIssues and comment lists
        issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[
            "relatedCommits"]

        # remove events without user
        issue["eventsList"] = [event for event in issue["eventsList"] if
                               not (event["user"] is None or event["ref_target"] is None)]

        # sorts eventsList by time
        issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"])

    return issue_data
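
# The final merge-and-sort step above, in miniature: with ISO-like timestamp
# strings, lexicographic order equals chronological order. Timestamps invented.
events_demo = [{"event": "commented", "created_at": "2014-05-02 10:00:00"},
               {"event": "created",   "created_at": "2014-05-01 09:00:00"}]
events_demo = sorted(events_demo, key=lambda k: k["created_at"])
print([e["event"] for e in events_demo])  # ['created', 'commented']
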
def reformat_events(issue_data):
    """
    Re-format event information dependent on the event type.

    :param issue_data: the data of all issues that shall be re-formatted
    :return: the issue data with updated event information
    """

    log.info("Update event information ...")

    for issue in issue_data:

        # re-format information of every event in the eventsList of an issue
        for event in issue["eventsList"]:

            if event["event"] == "closed":
                event["event"] = "state_updated"
                event["event_info_1"] = "closed"  # new state
                event["event_info_2"] = "open"  # old state
                issue["state_new"] = "closed"

            elif event["event"] == "reopened":
                event["event"] = "state_updated"
                event["event_info_1"] = "open"  # new state
                event["event_info_2"] = "closed"  # old state
                issue["state_new"] = "reopened"

            elif event["event"] == "labeled":
                label = event["label"]["name"].lower()
                event["event_info_1"] = label

                # if the label is in this list, it also is a type of the issue
                if label in known_types:
                    issue["type"].append(str(label))

                    # creates an event for type updates and adds it to the eventsList
                    type_event = dict()
                    type_event["user"] = event["user"]
                    type_event["created_at"] = event["created_at"]
                    type_event["event"] = "type_updated"
                    type_event["event_info_1"] = label
                    type_event["event_info_2"] = ""
                    type_event["ref_target"] = ""
                    issue["eventsList"].append(type_event)

                # if the label is in this list, it also is a resolution of the issue
                elif label in known_resolutions:
                    issue["resolution"].append(str(label))

                    # creates an event for resolution updates and adds it to the eventsList
                    resolution_event = dict()
                    resolution_event["user"] = event["user"]
                    resolution_event["created_at"] = event["created_at"]
                    resolution_event["event"] = "resolution_updated"
                    resolution_event["event_info_1"] = label
                    resolution_event["event_info_2"] = ""
                    resolution_event["ref_target"] = ""
                    issue["eventsList"].append(resolution_event)

            elif event["event"] == "unlabeled":
                label = event["label"]["name"].lower()
                event["event_info_1"] = label

                # if the label is in this list, it also is a type of the issue
                if label in known_types:
                    issue["type"].remove(str(label))

                    # creates an event for type updates and adds it to the eventsList
                    type_event = dict()
                    type_event["user"] = event["user"]
                    type_event["created_at"] = event["created_at"]
                    type_event["event"] = "type_updated"
                    type_event["event_info_1"] = ""
                    type_event["event_info_2"] = label
                    type_event["ref_target"] = ""
                    issue["eventsList"].append(type_event)

                # if the label is in this list, it also is a resolution of the issue
                elif label in known_resolutions:
                    issue["resolution"].remove(str(label))

                    # creates an event for resolution updates and adds it to the eventsList
                    resolution_event = dict()
                    resolution_event["user"] = event["user"]
                    resolution_event["created_at"] = event["created_at"]
                    resolution_event["event"] = "resolution_updated"
                    resolution_event["event_info_1"] = ""
                    resolution_event["event_info_2"] = label
                    resolution_event["ref_target"] = ""
                    issue["eventsList"].append(resolution_event)

            elif event["event"] == "commented":
                # "state_new" and "resolution" of the issue give the information about the state and the resolution of
                # the issue when the comment was written, because the eventsList is sorted by time
                event["event_info_1"] = issue["state_new"]
                event["event_info_2"] = str(issue["resolution"])

    return issue_data
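
# Sketch of the label classification in reformat_events: labels found in
# known_types update the issue type, labels found in known_resolutions update
# the resolution. The concrete vocabularies below are assumptions for the demo.
known_types_demo = ["bug", "enhancement"]
known_resolutions_demo = ["fixed", "wontfix"]

def classify_label(label):
    if label in known_types_demo:
        return "type_updated"
    elif label in known_resolutions_demo:
        return "resolution_updated"
    return None  # plain label, only event_info_1 is set

print(classify_label("bug"))       # type_updated
print(classify_label("wontfix"))   # resolution_updated
print(classify_label("question"))  # None
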
def load_issue_via_api(issues, persons, url):
    """
    For each issue in the list, the history is added via the API.

    :param issues: list of issues
    :param persons: list of persons from JIRA (incl. e-mail addresses)
    :param url: the project url

    log.info("Load issue information via api...")
    jira_project = JIRA(url)

    for issue in issues:

        api_issue = jira_project.issue(issue["externalId"], expand="changelog")
        changelog = api_issue.changelog
        histories = list()

        # adds the issue creation time with the default state to a list
        # list is needed to find out the state the issue had when a comment was written
        state_changes = [[issue["creationDate"], "open"]]

        # adds the issue creation time with the default resolution to a list
        # list is needed to find out the resolution the issue had when a comment was written
        resolution_changes = [[issue["creationDate"], "unresolved"]]

        # history changes get visited in time order from oldest to newest
        for change in changelog.histories:

            # default values for state and resolution
            old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved"

            # all changes in the issue changelog are checked for useful information
            for item in change.items:

                # state_updated event gets created and added to the issue history
                if item.field == "status":
                    if item.fromString is not None:
                        old_state = item.fromString.lower()
                    if item.toString is not None:
                        new_state = item.toString.lower()
                    history = dict()
                    history["event"] = "state_updated"
                    history["event_info_1"] = new_state
                    history["event_info_2"] = old_state
                    user = create_user(change.author.name, change.author.name, "")
                    history["author"] = merge_user_with_user_from_csv(user, persons)
                    history["date"] = format_time(change.created)
                    histories.append(history)
                    state_changes.append([history["date"], new_state])

                # resolution_updated event gets created and added to the issue history
                elif item.field == "resolution":
                    if item.fromString is not None:
                        old_resolution = item.fromString.lower()
                    if item.toString is not None:
                        new_resolution = item.toString.lower()
                    history = dict()
                    history["event"] = "resolution_updated"
                    history["event_info_1"] = new_resolution
                    history["event_info_2"] = old_resolution
                    user = create_user(change.author.name, change.author.name, "")
                    history["author"] = merge_user_with_user_from_csv(user, persons)
                    history["date"] = format_time(change.created)
                    histories.append(history)
                    resolution_changes.append([history["date"], new_resolution])

                # assigned event gets created and added to the issue history
                elif item.field == "assignee":
                    history = dict()
                    history["event"] = "assigned"
                    user = create_user(change.author.name, change.author.name, "")
                    history["author"] = merge_user_with_user_from_csv(user, persons)
                    assignee = create_user(item.toString, item.toString, "")
                    assigned_user = merge_user_with_user_from_csv(assignee, persons)
                    history["event_info_1"] = assigned_user["name"]
                    history["event_info_2"] = assigned_user["email"]
                    history["date"] = format_time(change.created)
                    histories.append(history)

                elif item.field == "Link":
                    # add_link event gets created and added to the issue history
                    if item.toString is not None:
                        history = dict()
                        history["event"] = "add_link"
                        user = create_user(change.author.name, change.author.name, "")
                        history["author"] = merge_user_with_user_from_csv(user, persons)
                        # the API returns a text; the issue id is at the end of the text and gets extracted
                        history["event_info_1"] = item.toString.split()[-1]
                        history["event_info_2"] = "issue"
                        history["date"] = format_time(change.created)
                        histories.append(history)

                    # remove_link event gets created and added to the issue history
                    if item.fromString is not None:
                        history = dict()
                        history["event"] = "remove_link"
                        user = create_user(change.author.name, change.author.name, "")
                        history["author"] = merge_user_with_user_from_csv(user, persons)
                        # the API returns a text; the issue id is at the end of the text and gets extracted
                        history["event_info_1"] = item.fromString.split()[-1]
                        history["event_info_2"] = "issue"
                        history["date"] = format_time(change.created)
                        histories.append(history)

        # state and resolution change lists get sorted by time
        state_changes.sort(key=lambda x: x[0])
        resolution_changes.sort(key=lambda x: x[0])

        for comment in issue["comments"]:

            # the state the issue had when the comment was written is searched out
            for state in state_changes:
                if comment["changeDate"] > state[0]:
                    comment["state_on_creation"] = state[1]

            # the resolution the issue had when the comment was written is searched out
            for resolution in resolution_changes:
                if comment["changeDate"] > resolution[0]:
                    comment["resolution_on_creation"] = [str(resolution[1])]

        issue["history"] = histories