class Code2DbMain():
    """
    This class handles the import of code information
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path, import_type,
                 extensions, references, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type import_type: int
        :param import_type: 1 = import overall function statistics per file, 2 = import function-level information

        :type extensions: list str
        :param extensions: file extensions to analyse. Gitana calculates loc, comments and blank lines for most
        file types. For ['java', 'py', 'php', 'scala', 'js', 'rb', 'cs', 'cpp', 'c'], Gitana also provides
        insights about ccn, functions and tokens.

        :type references: list str
        :param references: list of references to analyse

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-code-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._import_type = import_type
        self._extensions = extensions
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Code2DbMain.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_commit_file_pairs(self, repo_id):
        # selects the (commit, file) pairs to analyse, optionally filtered by reference and extension
        pairs = []

        filter_references = "1 = 1"
        if self._references:
            filter_references = "r.name IN (" + ",".join(["'" + e + "'" for e in self._references]) + ")"

        filter_extensions = "1 = 1"
        if self._extensions:
            filter_extensions = "f.ext IN (" + ",".join(["'" + e + "'" for e in self._extensions]) + ")"

        cursor = self._dao.get_cursor()
        query = "SELECT c.id AS commit_id, c.sha, f.id AS file_id, f.name AS file_name, f.ext AS file_ext " \
                "FROM commit_in_reference cin JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "JOIN file_modification fm ON fm.commit_id = c.id " \
                "JOIN file f ON f.id = fm.file_id " \
                "WHERE " + filter_references + " AND " + filter_extensions + " AND cin.repo_id = %s " \
                "GROUP BY c.id, f.id"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)
        while row:
            pairs.append({"commit_id": row[0], "commit_sha": row[1],
                          "file_id": row[2], "file_name": row[3], "file_ext": row[4]})
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return pairs

    def _get_info_code(self, repo_id):
        # distributes the (commit, file) pairs among the worker processes
        pairs = self._get_commit_file_pairs(repo_id)
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(pairs, self._num_processes) if len(i) > 0]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

        for interval in intervals:
            code_extractor = Code2DbCommitFile(self._db_name, self._git_repo_path, interval,
                                               self._import_type, self._config, self._log_path)
            queue_intervals.put(code_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def extract(self):
        """
        extracts code function data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Code2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            repo_id = self._dao.select_repo_id(self._repo_name)
            self._get_info_code(repo_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Code2DbMain finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
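# A minimal usage sketch for Code2DbMain (not part of the original module). The
# connection dict, DB name and paths below are illustrative assumptions; the DB,
# project and repository are expected to exist already (see DbSchema and Git2DbMain).
def _example_import_code():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    Code2DbMain("gitana_db", "my_project", "my_repo", "/path/to/repo",
                import_type=2, extensions=['java', 'py'], references=None,
                num_processes=None, config=config, log_root_path="logs/").extract()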
class GraphExporter():
    """
    This class exports the Gitana data to a graph representation
    """

    LOG_FOLDER_PATH = "logs"
    INPUT_PATH = os.path.join(os.path.dirname(resources.__file__), 'queries.json')

    def __init__(self, config, db_name, log_root_path):
        """
        :type config: dict
        :param config: the DB configuration file

        :type db_name: str
        :param db_name: the name of an existing DB

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._db_util = DbUtil()
        self._dsl_util = DslUtil()
        self._logging_util = LoggingUtil()

        self._log_path = log_root_path + "export-graph-" + db_name + ".log"
        self._logger = self._logging_util.get_logger(self._log_path)
        self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

        self._db_name = db_name
        self._config = config
        self._cnx = self._db_util.get_connection(self._config)
        self._db_util.set_database(self._cnx, self._db_name)
        self._db_util.set_settings(self._cnx)

    def _create_log_folder(self, name):
        # creates the log folder
        if not os.path.exists(name):
            os.makedirs(name)

    def _create_output_file(self, filename):
        # creates the output folder
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:
                # guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

    def _load_graph_exporter_json(self, json_path):
        # loads the JSON that drives the graph exporter process
        with open(json_path) as json_data:
            data = json.load(json_data)
        return data.get('graph')

    def _get_parameter(self, key, parameters):
        # gets a JSON parameter, resolving entity names to their DB ids
        found = None
        if key in ["EDGECOLOR", "NODECOLOR"]:
            found = parameters.get(key.lower())
        elif key.endswith("ID"):
            name = parameters.get(key[:-2].lower())
            found = self._dsl_util.find_entity_id(self._cnx, key[:-2].lower(), name, self._logger)

        if not found:
            self._logger.error("GraphExporter: parameter " + str(key) + " not found!")
        return found

    def _load_query_json(self, metric_name, parameters):
        # loads the queries stored in the JSON file and binds their parameters
        with open(GraphExporter.INPUT_PATH) as json_data:
            data = json.load(json_data)
        metrics = data.get('queries')

        try:
            found = [m for m in metrics if m.get('name') == metric_name][0]
            nodes_query = found.get('nodes')
            edges_query = found.get('edges')

            for k in found.keys():
                if k not in ['name', 'edges', 'nodes']:
                    k_value = str(self._get_parameter(k, parameters))
                    nodes_query = nodes_query.replace(k, k_value)
                    edges_query = edges_query.replace(k, k_value)

            return (nodes_query, edges_query)
        except Exception:
            self._logger.error("GraphExporter: metric " + str(metric_name) + " not found!")

    def export(self, file_path, json_path):
        """
        exports the Gitana data to a graph

        :type file_path: str
        :param file_path: the path where to export the graph

        :type json_path: str
        :param json_path: the path of the JSON that drives the export process
        """
        # gtype -> graph type = "undirected", "directed"; if null, "undirected"
        # gmode -> graph mode = "dynamic", "static"; if null, "dynamic"
        try:
            self._logger.info("GraphExporter started")
            start_time = datetime.now()

            exporter_data = self._load_graph_exporter_json(json_path)
            metric_name = exporter_data.get("name")
            parameters = exporter_data.get("params")

            (nodes_query, edges_query) = self._load_query_json(metric_name, parameters)
            gexf = GexfGenerator(self._cnx, self._logger)
            gexf.create(nodes_query, edges_query, parameters.get("type"), file_path)
            self._db_util.close_connection(self._cnx)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("GraphExporter: process finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GraphExporter failed", exc_info=True)
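# A usage sketch for GraphExporter (illustrative, not part of the original module).
# The driver JSON mirrors the "graph" structure read by _load_graph_exporter_json,
# but the metric name and params here are assumptions, not a catalogue of the
# queries shipped in queries.json:
#
#   {"graph": {"name": "users-on-issues",
#              "params": {"issuetracker": "my_tracker", "type": "undirected"}}}
def _example_export_graph():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    exporter = GraphExporter(config, "gitana_db", "logs/")
    exporter.export("./graph.gexf", "./export-graph.json")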
class Slack2DbUpdate():
    """
    This class handles the update of Slack data
    """

    def __init__(self, db_name, project_name, instant_messaging_name, tokens,
                 config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type instant_messaging_name: str
        :param instant_messaging_name: the name of an existing instant messaging in the DB to update

        :type tokens: list str
        :param tokens: a list of Slack tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-slack-" + db_name + "-" + project_name + "-" + instant_messaging_name
        self._project_name = project_name
        self._db_name = db_name
        self._instant_messaging_name = instant_messaging_name
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _update_channels(self, instant_messaging_id):
        # updates the channels of an instant messaging
        channel_ids = self._dao.get_channel_ids(instant_messaging_id)

        if channel_ids:
            intervals = [i for i in multiprocessing_util.get_tasks_intervals(channel_ids, len(self._tokens))
                         if len(i) > 0]

            queue_extractors = multiprocessing.JoinableQueue()
            results = multiprocessing.Queue()

            # Start consumers
            multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results)

            for i in range(len(intervals)):
                channel_extractor = SlackChannel2Db(self._db_name, instant_messaging_id, intervals[i],
                                                    self._tokens[i], self._config, self._log_path)
                queue_extractors.put(channel_extractor)

            # Add end-of-queue markers
            multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors)

            # Wait for all of the tasks to finish
            queue_extractors.join()

    def update(self):
        """
        updates the Slack data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Slack2DbUpdate started")
            start_time = datetime.now()

            self._querier = SlackQuerier(self._tokens[0], self._logger)
            self._dao = SlackDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            instant_messaging_id = self._dao.select_instant_messaging_id(self._instant_messaging_name, project_id)

            if instant_messaging_id:
                self._update_channels(instant_messaging_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Slack2DbUpdate finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Slack2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
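# A usage sketch for Slack2DbUpdate (hypothetical tokens and names, not shipped
# defaults). One worker process is started per token, so passing two tokens
# updates the channels with two parallel extractors.
def _example_update_slack():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    Slack2DbUpdate("gitana_db", "my_project", "my_slack",
                   ["xoxp-token-1", "xoxp-token-2"], config, "logs/").update()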
class Code2DbUpdate():
    """
    This class handles the update of code data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path,
                 extensions, references, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type extensions: list str
        :param extensions: file extensions to analyse. Currently supported extensions:
        ['java', 'py', 'php', 'scala', 'js', 'rb', 'cs', 'cpp', 'c']

        :type references: list str
        :param references: list of references to analyse

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "update-code-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._extensions = extensions
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Code2DbUpdate.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_new_commit_file_pairs(self, repo_id):
        # selects the (commit, file) pairs that have not been processed yet
        pairs = []

        filter_references = "1 = 1"
        if self._references:
            filter_references = "r.name IN (" + ",".join(["'" + e + "'" for e in self._references]) + ")"

        filter_extensions = "1 = 1"
        if self._extensions:
            filter_extensions = "f.ext IN (" + ",".join(["'" + e + "'" for e in self._extensions]) + ")"

        cursor = self._dao.get_cursor()
        query = "SELECT existing_pairs.* " \
                "FROM ( " \
                "SELECT cac.commit_id, cac.file_id FROM code_at_commit cac GROUP BY cac.commit_id, cac.file_id) AS processed_pairs " \
                "RIGHT JOIN " \
                "(SELECT c.id AS commit_id, c.sha, f.id AS file_id, f.name AS file_name, f.ext AS file_ext " \
                "FROM commit_in_reference cin JOIN reference r ON r.id = cin.ref_id " \
                "JOIN commit c ON c.id = cin.commit_id " \
                "JOIN file_modification fm ON fm.commit_id = c.id " \
                "JOIN file f ON f.id = fm.file_id " \
                "WHERE " + filter_references + " AND " + filter_extensions + " AND cin.repo_id = %s " \
                "GROUP BY c.id, f.id) AS existing_pairs " \
                "ON processed_pairs.commit_id = existing_pairs.commit_id AND processed_pairs.file_id = existing_pairs.file_id " \
                "WHERE processed_pairs.commit_id IS NULL"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)
        while row:
            pairs.append({"commit_id": row[0], "commit_sha": row[1],
                          "file_id": row[2], "file_name": row[3], "file_ext": row[4]})
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return pairs

    def _update_existing_references(self, repo_id, import_type):
        pairs = self._get_new_commit_file_pairs(repo_id)
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(pairs, self._num_processes) if len(i) > 0]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers (the queue, not the interval list, must be passed here)
        multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

        for interval in intervals:
            code_extractor = Code2DbCommitFile(self._db_name, self._git_repo_path, interval,
                                               import_type, self._config, self._log_path)
            queue_intervals.put(code_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _update_info_code(self, repo_id, import_type):
        # updates code data
        self._update_existing_references(repo_id, import_type)

    def _get_import_type(self, repo_id):
        # derives the import type from which code tables are still empty
        import_type = 0
        import_type += self._dao.function_at_commit_is_empty(repo_id) + self._dao.code_at_commit_is_empty(repo_id)
        return import_type

    def update(self):
        """
        updates the code data stored in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Code2DbUpdate started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            repo_id = self._dao.select_repo_id(self._repo_name)
            self._update_info_code(repo_id, self._get_import_type(repo_id))

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Code2DbUpdate finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Code2DbUpdate failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
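# A usage sketch for Code2DbUpdate (illustrative values). Only (commit, file)
# pairs not yet present in code_at_commit are processed, so the update can be
# re-run incrementally after new commits are imported with the Git importers.
def _example_update_code():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    Code2DbUpdate("gitana_db", "my_project", "my_repo", "/path/to/repo",
                  extensions=['java'], references=None, num_processes=None,
                  config=config, log_root_path="logs/").update()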
class DbSchema():
    """
    This class initializes the DB schema
    """

    def __init__(self, db_name, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of the DB to initialize/connect to. It cannot be null and must follow the
        format allowed in MySQL (http://dev.mysql.com/doc/refman/5.7/en/identifiers.html). If a DB with the
        same name already exists in Gitana, the existing DB will be dropped and a new one will be created

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._db_name = db_name
        self._config = config
        self._log_root_path = log_root_path
        self._db_util = DbUtil()
        self._logging_util = LoggingUtil()

        log_path = self._log_root_path + "db-schema-" + db_name
        self._logger = self._logging_util.get_logger(log_path)
        self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info")
        self._cnx = self._db_util.get_connection(self._config)

    def __del__(self):
        if self._cnx:
            self._db_util.close_connection(self._cnx)
        if self._logger:
            # deletes the file handler of the logger
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)

    def add_git_tables(self):
        """
        initializes git tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_git_tables()

    def add_issue_tracker_tables(self):
        """
        initializes issue tracker tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_shared_tables_issue_tracker_communication_channels()
        self._init_issue_tracker_tables()

    def add_instant_messaging_tables(self):
        """
        initializes instant messaging tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_shared_tables_issue_tracker_communication_channels()
        self._init_instant_messaging_tables()

    def add_forum_tables(self):
        """
        initializes forum tables if they do not exist
        """
        self.set_database(self._db_name)
        self._init_shared_tables_issue_tracker_communication_channels()
        self._init_forum_tables()

    def init_database(self, init_git, init_issue_tracker, init_forum, init_instant_messaging):
        """
        initializes the database tables and functions

        :type init_git: bool
        :param init_git: if True, it initializes the tables containing git data

        :type init_issue_tracker: bool
        :param init_issue_tracker: if True, it initializes the tables containing issue tracker data

        :type init_forum: bool
        :param init_forum: if True, it initializes the tables containing forum data

        :type init_instant_messaging: bool
        :param init_instant_messaging: if True, it initializes the tables containing instant messaging data
        """
        try:
            self._logger.info("init database started")
            start_time = datetime.now()
            self._create_database()
            self.set_database(self._db_name)
            self._set_settings()

            self._init_common_tables()

            if init_issue_tracker or init_forum or init_instant_messaging:
                self._init_shared_tables_issue_tracker_communication_channels()

            if init_git:
                self._init_git_tables()

            if init_issue_tracker:
                self._init_issue_tracker_tables()

            if init_forum:
                self._init_forum_tables()

            if init_instant_messaging:
                self._init_instant_messaging_tables()

            self._init_functions()
            self._logger.info("database " + self._db_name + " created")

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Init database finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
        except Exception:
            self._logger.error("init database failed", exc_info=True)

    def create_project(self, project_name):
        """
        inserts a project in the DB

        :type project_name: str
        :param project_name: the name of the project to create
        """
        self._cnx = self._db_util.get_connection(self._config)
        self._db_util.insert_project(self._cnx, self._db_name, project_name)
        self._db_util.close_connection(self._cnx)

    def create_repository(self, project_name, repo_name):
        """
        inserts a repository in the DB

        :type project_name: str
        :param project_name: the name of an existing project

        :type repo_name: str
        :param repo_name: the name of the repository to insert
        """
        self._cnx = self._db_util.get_connection(self._config)
        self.set_database(self._db_name)
        project_id = self._db_util.select_project_id(self._cnx, project_name, self._logger)
        try:
            self._db_util.insert_repo(self._cnx, project_id, repo_name, self._logger)
        except Exception:
            self._logger.error("repository " + repo_name + " not inserted", exc_info=True)
        self._db_util.close_connection(self._cnx)

    def list_projects(self):
        """
        lists all projects contained in the DB
        """
        self._cnx = self._db_util.get_connection(self._config)
        project_names = []
        self.set_database(self._db_name)
        cursor = self._cnx.cursor()
        query = "SELECT name FROM project"
        cursor.execute(query)

        row = cursor.fetchone()
        while row:
            project_names.append(row[0])
            row = cursor.fetchone()
        cursor.close()

        return project_names

    def set_database(self, db_name):
        """
        sets the DB used by the tool

        :type db_name: str
        :param db_name: the name of the DB
        """
        try:
            self._logger.info("set database " + db_name + " started")
            self._db_util.set_database(self._cnx, db_name)
            self._logger.info("set database " + db_name + " finished")
        except Exception:
            self._logger.error("set database failed", exc_info=True)

    def _set_settings(self):
        # sets the settings (max connections, charset, file format, ...) used by the DB
        self._db_util.set_settings(self._cnx)

    def _create_database(self):
        # creates the database, dropping any existing DB with the same name
        cursor = self._cnx.cursor()

        drop_database_if_exists = "DROP DATABASE IF EXISTS " + self._db_name
        cursor.execute(drop_database_if_exists)

        create_database = "CREATE DATABASE " + self._db_name
        cursor.execute(create_database)

        cursor.close()

    def _init_functions(self):
        # initializes the stored functions used for user matching
        cursor = self._cnx.cursor()

        levenshtein_distance = """
        CREATE DEFINER=`root`@`localhost` FUNCTION `levenshtein_distance`(s1 VARCHAR(255) CHARACTER SET utf8,
                                                                          s2 VARCHAR(255) CHARACTER SET utf8) RETURNS int(11)
            DETERMINISTIC
        BEGIN
            DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
            DECLARE s1_char CHAR CHARACTER SET utf8;
            -- max strlen = 255 for this function
            DECLARE cv0, cv1 VARBINARY(256);

            SET s1_len = CHAR_LENGTH(s1),
                s2_len = CHAR_LENGTH(s2),
                cv1 = 0x00,
                j = 1, i = 1, c = 0;

            IF (s1 = s2) THEN
                RETURN (0);
            ELSEIF (s1_len = 0) THEN
                RETURN (s2_len);
            ELSEIF (s2_len = 0) THEN
                RETURN (s1_len);
            END IF;

            WHILE (j <= s2_len) DO
                SET cv1 = CONCAT(cv1, CHAR(j)), j = j + 1;
            END WHILE;

            WHILE (i <= s1_len) DO
                SET s1_char = SUBSTRING(s1, i, 1), c = i, cv0 = CHAR(i), j = 1;
                WHILE (j <= s2_len) DO
                    SET c = c + 1, cost = IF(s1_char = SUBSTRING(s2, j, 1), 0, 1);
                    SET c_temp = ORD(SUBSTRING(cv1, j, 1)) + cost;
                    IF (c > c_temp) THEN
                        SET c = c_temp;
                    END IF;
                    SET c_temp = ORD(SUBSTRING(cv1, j + 1, 1)) + 1;
                    IF (c > c_temp) THEN
                        SET c = c_temp;
                    END IF;
                    SET cv0 = CONCAT(cv0, CHAR(c)), j = j + 1;
                END WHILE;
                SET cv1 = cv0, i = i + 1;
            END WHILE;

            RETURN (c);
        END"""

        soundex_match = """
        CREATE DEFINER=`root`@`localhost` FUNCTION `soundex_match`(s1 VARCHAR(255) CHARACTER SET utf8,
                                                                   s2 VARCHAR(255) CHARACTER SET utf8) RETURNS int(1)
            DETERMINISTIC
        BEGIN
            DECLARE _result INT DEFAULT 0;
            IF SOUNDEX(s1) = SOUNDEX(s2) THEN
                SET _result = 1;
            END IF;
            RETURN _result;
        END"""

        cursor.execute(levenshtein_distance)
        cursor.execute(soundex_match)
        cursor.close()

    def _init_common_tables(self):
        # initializes common tables used by tables modeling git, issue tracker, forum and instant messaging data
        cursor = self._cnx.cursor()

        create_table_project = "CREATE TABLE IF NOT EXISTS project( " \
                               "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                               "name varchar(255), " \
                               "CONSTRAINT name UNIQUE (name)" \
                               ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_user = "CREATE TABLE IF NOT EXISTS user ( " \
                            "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                            "name varchar(256), " \
                            "email varchar(256), " \
                            "CONSTRAINT namem UNIQUE (name, email) " \
                            ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_user_alias = "CREATE TABLE IF NOT EXISTS user_alias ( " \
                                  "user_id int(20), " \
                                  "alias_id int(20), " \
                                  "CONSTRAINT a UNIQUE (user_id) " \
                                  ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_project)
        cursor.execute(create_table_user)
        cursor.execute(create_table_user_alias)
        cursor.close()

    def _init_shared_tables_issue_tracker_communication_channels(self):
        # initializes shared tables used by tables modeling issue tracker, forum and instant messaging data
        cursor = self._cnx.cursor()

        create_table_label = "CREATE TABLE IF NOT EXISTS label ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "name varchar(256), " \
                             "CONSTRAINT name UNIQUE (name) " \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_message = "CREATE TABLE IF NOT EXISTS message ( " \
                               "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                               "own_id varchar(20), " \
                               "pos int(10), " \
                               "type_id int(20), " \
                               "issue_id int(20), " \
                               "topic_id int(20), " \
                               "channel_id int(20), " \
                               "body longblob, " \
                               "votes int(20), " \
                               "author_id int(20), " \
                               "created_at timestamp NULL DEFAULT NULL," \
                               "CONSTRAINT ip UNIQUE (issue_id, topic_id, channel_id, own_id) " \
                               ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_message_dependency = "CREATE TABLE IF NOT EXISTS message_dependency ( " \
                                          "source_message_id int(20), " \
                                          "target_message_id int(20), " \
                                          "PRIMARY KEY st (source_message_id, target_message_id) " \
                                          ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_message_type = "CREATE TABLE IF NOT EXISTS message_type ( " \
                                    "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                    "name varchar(255), " \
                                    "CONSTRAINT name UNIQUE (name) " \
                                    ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        insert_message_types = "INSERT IGNORE INTO message_type VALUES (NULL, 'question'), " \
                               "(NULL, 'answer'), " \
                               "(NULL, 'comment'), " \
                               "(NULL, 'accepted_answer'), " \
                               "(NULL, 'reply'), " \
                               "(NULL, 'file_upload'), " \
                               "(NULL, 'info');"

        create_table_attachment = "CREATE TABLE IF NOT EXISTS attachment ( " \
                                  "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                  "own_id varchar(20), " \
                                  "message_id int(20), " \
                                  "name varchar(256), " \
                                  "extension varchar(10), " \
                                  "bytes int(20), " \
                                  "url varchar(512), " \
                                  "CONSTRAINT ip UNIQUE (message_id, own_id) " \
                                  ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_label)
        cursor.execute(create_table_message)
        cursor.execute(create_table_message_dependency)
        cursor.execute(create_table_message_type)
        cursor.execute(insert_message_types)
        cursor.execute(create_table_attachment)
        cursor.close()

    def _init_git_tables(self):
        # initializes tables used to model git data
        cursor = self._cnx.cursor()

        create_table_repository = "CREATE TABLE IF NOT EXISTS repository( " \
                                  "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                  "project_id int(20), " \
                                  "name varchar(255), " \
                                  "CONSTRAINT name UNIQUE (name)" \
                                  ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_reference = "CREATE TABLE IF NOT EXISTS reference( " \
                                 "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                 "repo_id int(20), " \
                                 "name varchar(255), " \
                                 "type varchar(255), " \
                                 "CONSTRAINT name UNIQUE (repo_id, name, type) " \
                                 ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_commit = "CREATE TABLE IF NOT EXISTS commit(" \
                              "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                              "repo_id int(20), " \
                              "sha varchar(512), " \
                              "message varchar(512), " \
                              "author_id int(20), " \
                              "committer_id int(20), " \
                              "authored_date timestamp NULL DEFAULT NULL, " \
                              "committed_date timestamp NULL DEFAULT NULL, " \
                              "size int(20), " \
                              "INDEX sha (sha), " \
                              "INDEX auth (author_id), " \
                              "INDEX comm (committer_id), " \
                              "CONSTRAINT s UNIQUE (sha, repo_id) " \
                              ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_commit_parent = "CREATE TABLE IF NOT EXISTS commit_parent(" \
                                     "repo_id int(20), " \
                                     "commit_id int(20), " \
                                     "commit_sha varchar(512), " \
                                     "parent_id int(20), " \
                                     "parent_sha varchar(512), " \
                                     "PRIMARY KEY copa (repo_id, commit_id, parent_id), " \
                                     "CONSTRAINT cshapsha UNIQUE (repo_id, commit_id, parent_sha) " \
                                     ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_commits2reference = "CREATE TABLE IF NOT EXISTS commit_in_reference(" \
                                         "repo_id int(20), " \
                                         "commit_id int(20), " \
                                         "ref_id int(20), " \
                                         "PRIMARY KEY core (commit_id, ref_id) " \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_file = "CREATE TABLE IF NOT EXISTS file( " \
                            "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                            "repo_id int(20), " \
                            "name varchar(512), " \
                            "ext varchar(255), " \
                            "CONSTRAINT rerena UNIQUE (repo_id, name) " \
                            ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_file_renamed = "CREATE TABLE IF NOT EXISTS file_renamed ( " \
                                    "repo_id int(20), " \
                                    "current_file_id int(20), " \
                                    "previous_file_id int(20), " \
                                    "file_modification_id int(20), " \
                                    "PRIMARY KEY cpc (current_file_id, previous_file_id, file_modification_id) " \
                                    ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_file_modification = "CREATE TABLE IF NOT EXISTS file_modification ( " \
                                         "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                         "commit_id int(20), " \
                                         "file_id int(20), " \
                                         "status varchar(10), " \
                                         "additions numeric(10), " \
                                         "deletions numeric(10), " \
                                         "changes numeric(10), " \
                                         "patch longblob, " \
                                         "CONSTRAINT cf UNIQUE (commit_id, file_id) " \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_line_detail = "CREATE TABLE IF NOT EXISTS line_detail( " \
                                   "file_modification_id int(20)," \
                                   "type varchar(25), " \
                                   "line_number numeric(20), " \
                                   "is_commented numeric(1), " \
                                   "is_partially_commented numeric(1), " \
                                   "is_empty numeric(1), " \
                                   "content longblob, " \
                                   "PRIMARY KEY fityli (file_modification_id, type, line_number) " \
                                   ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        # added here because "file_dependency" depends on the "file" table creation
        # @todo: find a way to move the following table creation to a separate section and
        #        make the "extract_dependency_relations" API interface completely independent
        create_table_file_dependency = "CREATE TABLE file_dependency ( " \
                                       "repo_id int(20), " \
                                       "ref_id int(20), " \
                                       "source_file_id int(20), " \
                                       "target_file_id int(20), " \
                                       "CONSTRAINT dep UNIQUE (repo_id, ref_id, source_file_id, target_file_id) " \
                                       ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_repository)
        cursor.execute(create_table_reference)
        cursor.execute(create_table_commit)
        cursor.execute(create_table_commit_parent)
        cursor.execute(create_table_commits2reference)
        cursor.execute(create_table_file)
        cursor.execute(create_table_file_renamed)
        cursor.execute(create_table_file_modification)
        cursor.execute(create_table_line_detail)
        cursor.execute(create_table_file_dependency)
        cursor.close()

    def _init_issue_tracker_tables(self):
        # initializes tables used to model issue tracker data
        cursor = self._cnx.cursor()

        create_table_issue_tracker = "CREATE TABLE IF NOT EXISTS issue_tracker ( " \
                                     "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                     "repo_id int(20), " \
                                     "name varchar(512), " \
                                     "type varchar(512), " \
                                     "CONSTRAINT name UNIQUE (name)" \
                                     ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue = "CREATE TABLE IF NOT EXISTS issue ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "own_id varchar(20), " \
                             "issue_tracker_id int(20), " \
                             "summary varchar(512), " \
                             "component varchar(256), " \
                             "version varchar(256), " \
                             "hardware varchar(256), " \
                             "priority varchar(256), " \
                             "severity varchar(256), " \
                             "reference_id int(20), " \
                             "reporter_id int(20), " \
                             "created_at timestamp NULL DEFAULT NULL, " \
                             "last_change_at timestamp NULL DEFAULT NULL, " \
                             "CONSTRAINT ioi UNIQUE (issue_tracker_id, own_id), " \
                             "INDEX u (reporter_id), " \
                             "INDEX r (reference_id) " \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_assignee = "CREATE TABLE IF NOT EXISTS issue_assignee ( " \
                                      "issue_id int(20), " \
                                      "assignee_id int(20), " \
                                      "PRIMARY KEY il (issue_id, assignee_id) " \
                                      ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_subscriber = "CREATE TABLE IF NOT EXISTS issue_subscriber ( " \
                                        "issue_id int(20), " \
                                        "subscriber_id int(20), " \
                                        "PRIMARY KEY il (issue_id, subscriber_id) " \
                                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_event = "CREATE TABLE IF NOT EXISTS issue_event ( " \
                                   "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                   "issue_id int(20), " \
                                   "event_type_id int(20), " \
                                   "detail varchar(256), " \
                                   "creator_id int(20), " \
                                   "created_at timestamp NULL DEFAULT NULL, " \
                                   "target_user_id int(20), " \
                                   "CONSTRAINT iecc UNIQUE (issue_id, event_type_id, creator_id, created_at, detail) " \
                                   ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_event_type = "CREATE TABLE IF NOT EXISTS issue_event_type ( " \
                                        "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                        "name varchar(256), " \
                                        "CONSTRAINT name UNIQUE (name) " \
                                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_labelled = "CREATE TABLE IF NOT EXISTS issue_labelled ( " \
                                      "issue_id int(20), " \
                                      "label_id int(20), " \
                                      "PRIMARY KEY il (issue_id, label_id) " \
                                      ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_issue_commit_dependency = "CREATE TABLE IF NOT EXISTS issue_commit_dependency ( " \
                                         "issue_id int(20), " \
                                         "commit_id int(20), " \
                                         "PRIMARY KEY ict (issue_id, commit_id) " \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_issue_dependency = "CREATE TABLE IF NOT EXISTS issue_dependency ( " \
                                        "issue_source_id int(20), " \
                                        "issue_target_id int(20), " \
                                        "type_id int(20), " \
                                        "PRIMARY KEY st (issue_source_id, issue_target_id, type_id) " \
                                        ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_issue_dependency_type = "CREATE TABLE IF NOT EXISTS issue_dependency_type (" \
                                       "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                       "name varchar(256), " \
                                       "CONSTRAINT name UNIQUE (name) " \
                                       ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        insert_issue_dependency_type = "INSERT IGNORE INTO issue_dependency_type VALUES (NULL, 'block'), " \
                                       "(NULL, 'depends'), " \
                                       "(NULL, 'related'), " \
                                       "(NULL, 'duplicated');"

        cursor.execute(create_table_issue_tracker)
        cursor.execute(create_table_issue)
        cursor.execute(create_table_issue_assignee)
        cursor.execute(create_table_issue_subscriber)
        cursor.execute(create_table_issue_event)
        cursor.execute(create_table_issue_event_type)
        cursor.execute(create_table_issue_labelled)
        cursor.execute(create_issue_commit_dependency)
        cursor.execute(create_table_issue_dependency)
        cursor.execute(create_issue_dependency_type)
        cursor.execute(insert_issue_dependency_type)
        cursor.close()

    def _init_forum_tables(self):
        # initializes tables used to model forum data
        cursor = self._cnx.cursor()

        create_table_forum = "CREATE TABLE IF NOT EXISTS forum ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "project_id int(20), " \
                             "name varchar(512), " \
                             "type varchar(512), " \
                             "CONSTRAINT name UNIQUE (name)" \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_topic = "CREATE TABLE IF NOT EXISTS topic ( " \
                             "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                             "own_id varchar(20), " \
                             "forum_id int(20), " \
                             "name varchar(256), " \
                             "votes int(10), " \
                             "views int(10), " \
                             "created_at timestamp NULL DEFAULT NULL, " \
                             "last_change_at timestamp NULL DEFAULT NULL, " \
                             "CONSTRAINT name UNIQUE (forum_id, own_id)" \
                             ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_topic_labelled = "CREATE TABLE IF NOT EXISTS topic_labelled ( " \
                                      "topic_id int(20), " \
                                      "label_id int(20), " \
                                      "PRIMARY KEY il (topic_id, label_id) " \
                                      ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_forum)
        cursor.execute(create_table_topic)
        cursor.execute(create_table_topic_labelled)
        cursor.close()

    def _init_instant_messaging_tables(self):
        # initializes tables used to model instant messaging data
        cursor = self._cnx.cursor()

        create_table_instant_messaging = "CREATE TABLE IF NOT EXISTS instant_messaging ( " \
                                         "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                                         "project_id int(20), " \
                                         "name varchar(512), " \
                                         "type varchar(512), " \
                                         "CONSTRAINT name UNIQUE (name)" \
                                         ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        create_table_channel = "CREATE TABLE IF NOT EXISTS channel ( " \
                               "id int(20) AUTO_INCREMENT PRIMARY KEY, " \
                               "own_id varchar(20), " \
                               "instant_messaging_id int(20), " \
                               "name varchar(256), " \
                               "description varchar(512), " \
                               "created_at timestamp NULL DEFAULT NULL, " \
                               "last_change_at timestamp NULL DEFAULT NULL, " \
                               "CONSTRAINT name UNIQUE (instant_messaging_id, own_id)" \
                               ") ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;"

        cursor.execute(create_table_instant_messaging)
        cursor.execute(create_table_channel)
        cursor.close()
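# A usage sketch for DbSchema (illustrative values). init_database drops any
# existing DB with the same name, so point it at a throwaway schema when testing.
def _example_init_db():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    schema = DbSchema("gitana_db", config, "logs/")
    schema.init_database(init_git=True, init_issue_tracker=True,
                         init_forum=False, init_instant_messaging=False)
    schema.create_project("my_project")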
class Git2DbMain():
    """
    This class handles the import of Git data
    """

    NUM_PROCESSES = 5

    def __init__(self, db_name, project_name, repo_name, git_repo_path, before_date,
                 import_type, references, num_processes, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of the Git repository to import

        :type git_repo_path: str
        :param git_repo_path: the local path of the Git repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type import_type: int
        :param import_type: 1 does not import patches, 2 imports patches but not at line level,
        3 imports patches with line detail

        :type references: list str
        :param references: list of references to import

        :type num_processes: int
        :param num_processes: number of processes to import the data (default 5)

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-git-" + db_name + "-" + project_name + "-" + repo_name
        self._git_repo_path = git_repo_path
        self._project_name = project_name
        self._db_name = db_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._import_type = import_type
        self._references = references

        if num_processes:
            self._num_processes = num_processes
        else:
            self._num_processes = Git2DbMain.NUM_PROCESSES

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_existing_references(self, repo_id):
        # retrieves already imported references
        existing_refs = []

        cursor = self._dao.get_cursor()
        query = "SELECT ref.name " \
                "FROM reference ref JOIN repository r ON ref.repo_id = r.id " \
                "WHERE r.id = %s"
        arguments = [repo_id]
        self._dao.execute(cursor, query, arguments)

        row = self._dao.fetchone(cursor)
        while row:
            existing_refs.append(row[0])
            row = self._dao.fetchone(cursor)
        self._dao.close_cursor(cursor)

        return existing_refs

    def _get_info_contribution(self, repo_id):
        # processes Git data
        existing_refs = self._get_existing_references(repo_id)

        queue_references = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_references, results)

        for reference in self._querier.get_references():
            # import only the selected references; if none are selected,
            # import the references that have not been imported yet
            if self._references:
                selected = reference[0] in self._references
            else:
                selected = reference[0] not in existing_refs

            if selected:
                git_ref_extractor = Git2DbReference(self._db_name, repo_id, self._git_repo_path,
                                                    self._before_date, self._import_type,
                                                    reference[0], "", self._config, self._log_path)
                queue_references.put(git_ref_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_references)

        # Wait for all of the tasks to finish
        queue_references.join()

    def extract(self):
        """
        extracts Git data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("Git2DbMain started")
            start_time = datetime.now()

            self._querier = GitQuerier(self._git_repo_path, self._logger)
            self._dao = GitDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            self._dao.insert_repo(project_id, self._repo_name)
            repo_id = self._dao.select_repo_id(self._repo_name)

            # info contribution does not need a connection to the db
            self._get_info_contribution(repo_id)
            self._dao.restart_connection()
            self._dao.fix_commit_parent_table(repo_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("Git2DbMain finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("Git2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
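# A usage sketch for Git2DbMain (illustrative values). import_type=1 skips
# patches; before_date=None imports the full history of the selected references.
def _example_import_git():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    Git2DbMain("gitana_db", "my_project", "my_repo", "/path/to/repo",
               before_date=None, import_type=1, references=None,
               num_processes=None, config=config, log_root_path="logs/").extract()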
class StackOverflow2DbMain():
    """
    This class handles the import of Stackoverflow data
    """

    def __init__(self, db_name, project_name, type, forum_name,
                 search_query, before_date, tokens, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type type: str
        :param type: type of the forum (Stackoverflow, Eclipse forum)

        :type forum_name: str
        :param forum_name: the name of the forum to import

        :type search_query: str
        :param search_query: a label used to mark questions in Stackoverflow

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type tokens: list str
        :param tokens: list of Stackoverflow tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-stackoverflow-" + db_name + "-" + project_name + "-" + forum_name
        self._type = type
        self._forum_name = forum_name
        self._search_query = search_query.strip()
        self._project_name = project_name
        self._db_name = db_name
        self._before_date = before_date
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _get_topics(self, forum_id):
        # processes Stackoverflow questions, skipping the ones already imported
        topic_imported = self._dao.get_topic_own_ids(forum_id)
        topic_ids = list(set(self._querier.get_topic_ids(self._search_query, self._before_date)) -
                         set(topic_imported))
        topic_ids.sort()

        intervals = [i for i in multiprocessing_util.get_tasks_intervals(topic_ids, len(self._tokens))
                     if len(i) > 0]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results)

        pos = 0
        for interval in intervals:
            topic_extractor = StackOverflowTopic2Db(self._db_name, forum_id, interval,
                                                    self._tokens[pos], self._config, self._log_path)
            queue_extractors.put(topic_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()

    def extract(self):
        """
        extracts Stackoverflow data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("StackOverflow2DbMain started")
            start_time = datetime.now()

            self._querier = StackOverflowQuerier(self._tokens[0], self._logger)
            self._dao = StackOverflowDao(self._config, self._logger)

            project_id = self._dao.select_project_id(self._project_name)
            forum_id = self._dao.insert_forum(project_id, self._forum_name, self._type)
            self._get_topics(forum_id)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("StackOverflow2DbMain finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("StackOverflow2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
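# A usage sketch for StackOverflow2DbMain (hypothetical tag and token). The
# search_query is the Stackoverflow tag used to select questions; one worker
# process is spawned per token.
def _example_import_stackoverflow():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    StackOverflow2DbMain("gitana_db", "my_project", "stackoverflow", "so_forum",
                         "gitana", None, ["so-token-1"], config, "logs/").extract()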
class StackOverflowTopic2Db(object):
    """
    This class handles the import of Stackoverflow topics
    """

    def __init__(self, db_name, forum_id, interval, token, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type forum_id: int
        :param forum_id: the id of an existing forum in the DB

        :type interval: list int
        :param interval: a list of topic ids to import

        :type token: str
        :param token: a Stackoverflow token

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_root_path = log_root_path
        self._interval = interval
        self._db_name = db_name
        self._forum_id = forum_id
        self._token = token
        self._config = config

        self._logging_util = LoggingUtil()
        self._fileHandler = None
        self._logger = None
        self._querier = None
        self._dao = None

    def __call__(self):
        try:
            log_path = self._log_root_path + "-topic2db-" + str(self._interval[0]) + "-" + str(self._interval[-1])
            self._logger = self._logging_util.get_logger(log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info")

            self._querier = StackOverflowQuerier(self._token, self._logger)
            self._dao = StackOverflowDao(self._config, self._logger)
            self.extract()
        except Exception:
            self._logger.error("StackOverflowTopic2Db failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()

    def _extract_answers(self, answers, topic_id, message_id):
        # extracts the answers of a question
        for a in answers:
            own_id = self._querier.get_container_own_id(a)
            body = self._querier.get_container_body(a)
            author_id = self._dao.get_user_id(self._querier.get_container_author(a))
            created_at = self._querier.get_container_created_at(a)
            votes = self._querier.get_container_votes(a)

            if self._querier.is_accepted_answer(a):
                message_type = "accepted_answer"
            else:
                message_type = "answer"

            answer_message_id = self._dao.select_message_id(own_id, topic_id)
            if answer_message_id:
                self._dao.update_message(own_id, topic_id, self._querier.remove_html_tags(body), votes)
            else:
                self._dao.insert_message(own_id, self.pos, self._dao.get_message_type_id(message_type),
                                         topic_id, self._querier.remove_html_tags(body), votes,
                                         author_id, created_at)
                answer_message_id = self._dao.select_message_id(own_id, topic_id)

            self._dao.insert_message_dependency(message_id, answer_message_id)
            self._extract_attachments(body, answer_message_id)
            self.pos += 1

            self._extract_comment_messages(self._querier.get_comments(a), topic_id, answer_message_id)

    def _extract_comment_messages(self, comments, topic_id, message_id):
        # extracts the comments of a question or answer
        for c in comments:
            own_id = self._querier.get_container_own_id(c)
            body = self._querier.get_container_body(c)
            author_id = self._dao.get_user_id(self._querier.get_container_author(c))
            created_at = self._querier.get_container_created_at(c)
            votes = self._querier.get_container_votes(c)

            comment_message_id = self._dao.select_message_id(own_id, topic_id)
            if comment_message_id:
                self._dao.update_message(own_id, topic_id, self._querier.remove_html_tags(body), votes)
            else:
                self._dao.insert_message(own_id, self.pos, self._dao.get_message_type_id("comment"),
                                         topic_id, self._querier.remove_html_tags(body), votes,
                                         author_id, created_at)
                comment_message_id = self._dao.select_message_id(own_id, topic_id)

            self._dao.insert_message_dependency(message_id, comment_message_id)
            self._extract_attachments(body, comment_message_id)
            self.pos += 1

    def _extract_attachments(self, body, message_id):
        # extracts the attachments referenced in a message body
        attachments = self._querier.get_attachments(body)
        if attachments:
            self._insert_attachments(attachments, message_id)

    def _insert_labels(self, labels, topic_id):
        # inserts the labels of a topic
        for l in labels:
            self._dao.insert_label(l)
            label_id = self._dao.select_label_id(l)
            self._dao.assign_label_to_topic(label_id, topic_id)

    def _insert_attachments(self, attachments, message_id):
        # inserts attachments
        pos = 0
        for attachment in attachments:
            attachment_name = self._querier.get_attachment_name(attachment)
            attachment_own_id = self._querier.generate_attachment_id(message_id, pos)
            attachment_url = self._querier.get_attachment_url(attachment)
            self._dao.insert_attachment(attachment_own_id, message_id, attachment_name, attachment_url)
            pos += 1

    def _extract_topic(self, topic):
        # extracts a topic if its last change is more recent than the one stored in the DB
        last_change_at = self._querier.get_topic_last_change_at(topic)
        own_id = self._querier.get_container_own_id(topic)

        if self._dao.get_topic_last_change_at(own_id, self._forum_id) != last_change_at:
            name = self._querier.get_topic_name(topic)
            votes = self._querier.get_container_votes(topic)
            views = self._querier.get_topic_views(topic)
            created_at = self._querier.get_container_created_at(topic)

            topic_id = self._dao.insert_topic(own_id, self._forum_id, name, votes, views,
                                              created_at, last_change_at)
            author_id = self._dao.get_user_id(self._querier.get_container_author(topic))

            labels = self._querier.get_topic_labels(topic)
            self._insert_labels(labels, topic_id)

            self.pos = 0
            body = self._querier.get_container_body(topic)

            message_id = self._dao.select_message_id(own_id, topic_id)
            if message_id:
                self._dao.update_message(own_id, topic_id, self._querier.remove_html_tags(body), votes)
            else:
                self._dao.insert_message(own_id, self.pos, self._dao.get_message_type_id("question"),
                                         topic_id, self._querier.remove_html_tags(body), votes,
                                         author_id, created_at)
                message_id = self._dao.select_message_id(own_id, topic_id)

            self._extract_attachments(body, message_id)
            self.pos += 1

            self._extract_comment_messages(self._querier.get_comments(topic), topic_id, message_id)
            self._extract_answers(self._querier.get_answers(topic), topic_id, message_id)

    def extract(self):
        """
        extracts Stackoverflow topic data and stores it in the DB
        """
        try:
            self._logger.info("StackOverflowTopic2Db started")
            start_time = datetime.now()

            for topic_id in self._interval:
                topic = self._querier.get_topic(topic_id)
                if topic:
                    self._extract_topic(topic)

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("StackOverflowTopic2Db finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("StackOverflowTopic2Db failed", exc_info=True)
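# StackOverflowTopic2Db instances are picklable tasks: the main class puts them
# on a JoinableQueue and each consumer invokes __call__, which opens its own
# querier and DAO. A manual (non-queued) run of one worker, with hypothetical
# ids and token, would look like:
def _example_run_topic_worker():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root',
              'database': 'gitana_db'}
    worker = StackOverflowTopic2Db("gitana_db", forum_id=1, interval=[101, 102],
                                   token="so-token-1", config=config,
                                   log_root_path="logs/import-stackoverflow")
    worker()  # __call__ extracts the topics in the interval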
class GitHubIssue2DbMain():
    """
    This class handles the import of GitHub issue data
    """

    def __init__(self, db_name, project_name, repo_name, type, issue_tracker_name,
                 url, before_date, tokens, config, log_root_path):
        """
        :type db_name: str
        :param db_name: the name of an existing DB

        :type project_name: str
        :param project_name: the name of an existing project in the DB

        :type repo_name: str
        :param repo_name: the name of an existing repository in the DB

        :type type: str
        :param type: type of the issue tracker (Bugzilla, GitHub)

        :type issue_tracker_name: str
        :param issue_tracker_name: the name of the issue tracker to import

        :type url: str
        :param url: full name of the GitHub repository

        :type before_date: str
        :param before_date: import data before date (YYYY-mm-dd)

        :type tokens: list str
        :param tokens: list of GitHub tokens

        :type config: dict
        :param config: the DB configuration file

        :type log_root_path: str
        :param log_root_path: the log path
        """
        self._log_path = log_root_path + "import-github-" + db_name + "-" + project_name + "-" + issue_tracker_name
        self._type = type
        self._url = url
        self._project_name = project_name
        self._db_name = db_name
        self._issue_tracker_name = issue_tracker_name
        self._repo_name = repo_name
        self._before_date = before_date
        self._tokens = tokens

        config.update({'database': db_name})
        self._config = config

        self._logging_util = LoggingUtil()
        self._logger = None
        self._fileHandler = None
        self._querier = None
        self._dao = None

    def _pass_list_as_argument(self, elements):
        return '-'.join([str(e) for e in elements])

    def _insert_issue_data(self, repo_id, issue_tracker_id):
        # processes issue data, skipping the issues already imported
        imported = self._dao.get_already_imported_issue_ids(issue_tracker_id, repo_id)
        issues = list(set(self._querier.get_issue_ids(self._before_date)) - set(imported))

        intervals = [i for i in multiprocessing_util.get_tasks_intervals(issues, len(self._tokens))
                     if len(i) > 0]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens), queue_intervals, results)

        pos = 0
        for interval in intervals:
            issue_extractor = GitHubIssue2Db(self._db_name, repo_id, issue_tracker_id, self._url,
                                             interval, self._tokens[pos], self._config, self._log_path)
            queue_intervals.put(issue_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens), queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _insert_issue_dependencies(self, repo_id, issue_tracker_id):
        # processes issue dependency data
        issues = self._dao.get_already_imported_issue_ids(issue_tracker_id, repo_id)

        intervals = [i for i in multiprocessing_util.get_tasks_intervals(issues, len(self._tokens))
                     if len(i) > 0]

        queue_intervals = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens), queue_intervals, results)

        pos = 0
        for interval in intervals:
            issue_dependency_extractor = GitHubIssueDependency2Db(self._db_name, repo_id, issue_tracker_id,
                                                                  self._url, interval, self._tokens[pos],
                                                                  self._config, self._log_path)
            queue_intervals.put(issue_dependency_extractor)
            pos += 1

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens), queue_intervals)

        # Wait for all of the tasks to finish
        queue_intervals.join()

    def _split_issue_extraction(self):
        # splits the issues found according to the number of processes
        project_id = self._dao.select_project_id(self._project_name)
        repo_id = self._dao.select_repo_id(project_id, self._repo_name)
        issue_tracker_id = self._dao.insert_issue_tracker(repo_id, self._issue_tracker_name, self._type)
        self._insert_issue_data(repo_id, issue_tracker_id)

        self._dao.restart_connection()
        # self._insert_issue_dependencies(repo_id, issue_tracker_id)

    def extract(self):
        """
        extracts GitHub issue data and stores it in the DB
        """
        try:
            self._logger = self._logging_util.get_logger(self._log_path)
            self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info")

            self._logger.info("GitHubIssue2DbMain started")
            start_time = datetime.now()

            self._querier = GitHubQuerier(self._url, self._tokens[0], self._logger)
            self._dao = GitHubDao(self._config, self._logger)
            self._split_issue_extraction()

            end_time = datetime.now()
            minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time)
            self._logger.info("GitHubIssue2DbMain finished after " + str(minutes_and_seconds[0]) +
                              " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs")
            self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
        except Exception:
            self._logger.error("GitHubIssue2DbMain failed", exc_info=True)
        finally:
            if self._dao:
                self._dao.close_connection()
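# A usage sketch for GitHubIssue2DbMain (hypothetical repository and token).
# url is the full name of the GitHub repository, e.g. "owner/repo".
def _example_import_github_issues():
    config = {'host': 'localhost', 'port': '3306', 'user': 'root', 'password': 'root'}
    GitHubIssue2DbMain("gitana_db", "my_project", "my_repo", "github",
                       "my_tracker", "owner/repo", None,
                       ["ghp_token_1"], config, "logs/").extract()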
class EclipseForum2DbMain(): """ This class handles the import of Eclipse forum data """ NUM_PROCESSES = 2 def __init__(self, db_name, project_name, type, forum_name, url, before_date, num_processes, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type type: str :param type: type of the forum (Stackoverflow, Eclipse forum) :type forum_name: str :param forum_name: the name of the forum to import :type url: str :param url: the URL of the forum :type before_date: str :param before_date: import data before date (YYYY-mm-dd) :type num_processes: int :param num_processes: number of processes to import the data (default 2) :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "import-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name self._type = type self._url = url self._forum_name = forum_name self._project_name = project_name self._db_name = db_name self._before_date = before_date config.update({'database': db_name}) self._config = config if num_processes: self._num_processes = num_processes else: self._num_processes = EclipseForum2DbMain.NUM_PROCESSES self._logging_util = LoggingUtil() self._date_util = DateUtil() self._logger = None self._fileHandler = None self._querier = None self._dao = None def _get_topic_info(self, forum_id, topic): # get topic information own_id = self._querier.get_topic_own_id(topic) title = self._querier.get_topic_title(topic) views = self._querier.get_topic_views(topic) last_change_at = self._date_util.get_timestamp( self._querier.get_last_change_at(topic), "%a, %d %B %Y %H:%M") topic_id = self._dao.select_topic_id(forum_id, own_id) if not topic_id: if self._before_date: topic_created_at = self._querier.get_topic_created_at(topic) if self._date_util.get_timestamp(topic_created_at, "%a, %d %B %Y") <= \ self._date_util.get_timestamp(self._before_date, "%Y-%m-%d"): self._dao.insert_topic(own_id, forum_id, title, views, last_change_at) else: self._dao.insert_topic(own_id, forum_id, title, views, last_change_at) topic_id = self._dao.select_topic_id(forum_id, own_id) return topic_id def _get_topic_ids(self, forum_id): # get list of topic ids of a forum topic_ids = [] next_page = True while next_page: topics_on_page = self._querier.get_topics() for t in topics_on_page: topic_id = self._get_topic_info(forum_id, t) topic_ids.append(topic_id) next_page = self._querier.go_next_page() return [ti for ti in topic_ids if ti is not None] def _get_topics(self, forum_id): # insert topics to DB self._querier.start_browser() topic_ids = self._get_topic_ids(forum_id) self._querier.close_browser() intervals = [ i for i in multiprocessing_util.get_tasks_intervals( topic_ids, self._num_processes) if len(i) > 0 ] queue_extractors = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(self._num_processes, queue_extractors, results) for interval in intervals: topic_extractor = EclipseTopic2Db(self._db_name, forum_id, interval, self._config, self._log_path) queue_extractors.put(topic_extractor) # Add end-of-queue markers multiprocessing_util.add_poison_pills(self._num_processes, queue_extractors) # Wait for all of the tasks to finish queue_extractors.join() def extract(self): """ extracts Eclipse forum data and stores it in the DB """ try: self._logger = 
self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._logger.info("EclipseForum2DbMain started") start_time = datetime.now() self._querier = EclipseForumQuerier(self._url, self._logger) self._dao = EclipseForumDao(self._config, self._logger) project_id = self._dao.select_project_id(self._project_name) forum_id = self._dao.insert_forum(project_id, self._forum_name, self._type) self._get_topics(forum_id) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("EclipseForum2DbMain finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("EclipseForum2DbMain failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
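_get_topics splits the collected topic ids into one interval per worker via multiprocessing_util.get_tasks_intervals, then drops empty intervals. An assumed equivalent of that chunking helper is sketched below (the real one may distribute items differently, which is why callers filter out empty intervals):

def get_tasks_intervals(tasks, num_processes):
    # distribute len(tasks) items over contiguous chunks, at most one per process
    chunk_size = len(tasks) // num_processes
    if len(tasks) % num_processes != 0:
        chunk_size += 1
    return [tasks[i:i + chunk_size] for i in range(0, len(tasks), chunk_size)]

print(get_tasks_intervals(list(range(10)), 3))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]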
class BugzillaIssue2Db(object): """ This class handles the import of Bugzilla issues """ def __init__(self, db_name, repo_id, issue_tracker_id, url, product, interval, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type issue_tracker_id: int :param issue_tracker_id: the id of an existing issue tracker in the DB :type url: str :param url: the URL of the bugzilla issue tracker :type product: str :param product: the name of the product in the bugzilla issue tracker :type interval: list int :param interval: a list of issue ids to import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._url = url self._product = product self._db_name = db_name self._repo_id = repo_id self._issue_tracker_id = issue_tracker_id self._interval = interval self._config = config self._logging_util = LoggingUtil() self._date_util = DateUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): try: log_path = self._log_root_path + "-issue2db-" + str( self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, log_path, "info") self._querier = BugzillaQuerier(self._url, self._product, self._logger) self._dao = BugzillaDao(self._config, self._logger) self.extract() except Exception: self._logger.error("BugzillaIssue2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _is_email(self, str): # checks that a string is an email return parseaddr(str)[1] != '' and '@' in str def _extract_attachment(self, issue_comment_id, attachment_id): # inserts an attachment attachment_info = self._querier.get_attachment(attachment_id) if '.' 
in attachment_info.name: name = ('.').join(attachment_info.name.split('.')[:-1]).strip() extension = attachment_info.name.split('.')[-1].lower() else: name = attachment_info.name extension = "patch" size = sys.getsizeof(attachment_info) self._dao.insert_attachment(attachment_id, issue_comment_id, name, extension, size, None) def _extract_issue_event(self, action, action_content, creator_id, created_at, issue_id, field_name): # inserts an issue event event_type = action + '-' + field_name self._dao.insert_event_type(event_type) event_type_id = self._dao.select_event_type(event_type) target_user_id = None if ',' in action_content and field_name in [ "keywords", "depends_on", "cc", "flagtypes.name", "blocks", "whiteboard", "see_also" ]: contents = action_content.split(',') for content in contents: content = content.strip() if self._is_email(content): target_user_id = self._dao.get_user_id( self._querier.get_user_name(content), content) self._dao.insert_issue_event(issue_id, event_type_id, content, creator_id, created_at, target_user_id) else: if self._is_email(action_content): target_user_id = self._dao.get_user_id( self._querier.get_user_name(action_content), action_content) self._dao.insert_issue_event(issue_id, event_type_id, action_content, creator_id, created_at, target_user_id) def _extract_history(self, issue_id, history): # inserts the history of an issue for event in history: try: created_at = self._date_util.get_timestamp( self._querier.get_event_property(event, 'when'), '%Y%m%dT%H:%M:%S') creator_email = self._querier.get_event_property(event, 'who') creator_id = self._dao.get_user_id( self._querier.get_user_name(creator_email), creator_email) for change in self._querier.get_event_property( event, 'changes'): removed = self._querier.get_change_property( change, 'removed') field_name = self._querier.get_change_property( change, 'field_name').lower() added = self._querier.get_change_property(change, 'added') if removed != '': action = "removed" self._extract_issue_event(action, removed, creator_id, created_at, issue_id, field_name) if added != '': action = "added" self._extract_issue_event(action, added, creator_id, created_at, issue_id, field_name) except Exception: self._logger.warning("event at (" + str(created_at) + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_subscribers(self, issue_id, subscribers): # inserts subscribers of an issue for subscriber in subscribers: try: subscriber_id = self._dao.get_user_id( self._querier.get_user_name(subscriber), subscriber) self._dao.insert_subscriber(issue_id, subscriber_id) except Exception: self._logger.warning("subscriber (" + subscriber + ") not inserted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_assignee(self, issue_id, assignee): # inserts the assignee of an issue try: assignee_id = self._dao.get_user_id( self._querier.get_user_name(assignee), assignee) self._dao.insert_assignee(issue_id, assignee_id) except Exception: self._logger.warning( "assignee (" + assignee + ") not inserted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_comments(self, issue_id, comments): # inserts the comments of an issue for comment in comments: try: own_id = self._querier.get_comment_property(comment, 'id') body = self._querier.get_comment_property(comment, 'text') position = self._querier.get_comment_property(comment, 'count') author_email = 
self._querier.get_comment_property( comment, 'author') author_id = self._dao.get_user_id( self._querier.get_user_name(author_email), author_email) created_at = self._date_util.get_timestamp( self._querier.get_comment_property(comment, 'creation_time'), '%Y%m%dT%H:%M:%S') self._dao.insert_issue_comment( own_id, position, self._dao.get_message_type_id("comment"), issue_id, body, None, author_id, created_at) attachment_id = self._querier.get_comment_property( comment, 'attachment_id') if attachment_id: issue_comment_id = self._dao.select_issue_comment_id( own_id, issue_id, created_at) self._extract_attachment(issue_comment_id, attachment_id) except Exception: self._logger.warning("comment(" + str(position) + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) continue def _extract_labels(self, issue_id, labels): # inserts the labels of an issue for label in labels: try: digested_label = re.sub("^\W+", "", re.sub("\W+$", "", label.lower())) self._dao.insert_label(digested_label.strip()) label_id = self._dao.select_label_id(digested_label) self._dao.assign_label_to_issue(issue_id, label_id) except Exception: self._logger.warning("label (" + label + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_issue_commit_dependency(self, issue_id, commits): # inserts the dependencies between an issue and commits flattened_list = [y for x in commits for y in x] for id in flattened_list: if "commit" in id: extracted = id.split("?id=")[1].strip() commit_id = self._dao.select_commit(extracted, self._repo_id) self._dao.insert_issue_commit_dependency(issue_id, commit_id) def _is_duplicated(self, issue): flag = True try: issue.dupe_of except: flag = False return flag def _get_issue_info(self, issue_own_id): # processes each single issue flag_insert_issue_data = False issue = self._querier.get_issue(issue_own_id) summary = self._querier.get_issue_summary(issue) component = self._querier.get_issue_component(issue) version = self._querier.get_issue_version(issue) hardware = self._querier.get_issue_operating_system(issue) priority = self._querier.get_issue_priority(issue) severity = self._querier.get_issue_severity(issue) created_at = self._querier.get_issue_creation_time(issue) last_change_at = self._querier.get_issue_last_change_time(issue) reference_id = self._dao.find_reference_id(version, issue_own_id, self._repo_id) issue_creator_email = self._querier.get_issue_creator(issue) user_id = self._dao.get_user_id( self._querier.get_user_name(issue_creator_email), issue_creator_email) stored_issue_last_change = self._dao.select_last_change_issue( issue_own_id, self._issue_tracker_id, self._repo_id) if stored_issue_last_change: if last_change_at != stored_issue_last_change: flag_insert_issue_data = True self._dao.update_issue(issue_own_id, self._issue_tracker_id, summary, component, version, hardware, priority, severity, reference_id, last_change_at) else: flag_insert_issue_data = True self._dao.insert_issue(issue_own_id, self._issue_tracker_id, summary, component, version, hardware, priority, severity, reference_id, user_id, created_at, last_change_at) if flag_insert_issue_data: issue_id = self._dao.select_issue_id(issue_own_id, self._issue_tracker_id, self._repo_id) try: self._extract_labels(issue_id, self._querier.get_issue_keywords(issue)) except Exception: self._logger.error( "BugzillaError when extracting keywords for issue id: " + str(issue_id) + " - tracker id " + 
str(self._issue_tracker_id), exc_info=True) try: self._extract_comments(issue_id, self._querier.get_issue_comments(issue)) except Exception: self._logger.error( "BugzillaError when extracting comments for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) try: self._extract_history(issue_id, self._querier.get_issue_history(issue)) except Exception: self._logger.error( "BugzillaError when extracting history for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) if issue.cc: self._extract_subscribers(issue_id, self._querier.get_issue_cc(issue)) if issue.assigned_to: self._extract_assignee(issue_id, self._querier.get_issue_assignee(issue)) if issue.see_also: self._extract_issue_commit_dependency( issue_id, [self._querier.get_issue_see_also(issue)]) def _get_issues(self): # processes issues for issue_id in self._interval: try: self._get_issue_info(issue_id) except Exception: self._logger.error("something went wrong for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def extract(self): """ extracts Bugzilla issue data and stores it in the DB """ try: self._logger.info("BugzillaIssue2Db started") start_time = datetime.now() self._get_issues() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("BugzillaIssue2Db finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("BugzillaIssue2Db failed", exc_info=True)
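_is_email above relies on email.utils.parseaddr, which returns a (realname, addr-spec) pair; any string whose addr-spec is non-empty and contains an '@' is treated as an email. A quick demonstration of that heuristic:

from email.utils import parseaddr

def is_email(s):
    return parseaddr(s)[1] != '' and '@' in s

print(is_email("John Doe <john@example.org>"))  # True: addr-spec is 'john@example.org'
print(is_email("john@example.org"))             # True
print(is_email("keywords"))                     # False: no '@' in the addr-spec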
class BugzillaIssue2DbUpdate(): """ This class handles the update of Bugzilla issue tracker data """ NUM_PROCESSES = 3 def __init__(self, db_name, project_name, repo_name, issue_tracker_name, url, product, num_processes, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type repo_name: str :param repo_name: the name of an existing repository in the DB :type issue_tracker_name: str :param issue_tracker_name: the name of the issue tracker to import :type url: str :param url: the URL of the issue tracker :type product: str :param product: the name of the product to import from the issue tracker :type num_processes: int :param num_processes: number of processes to import the data (default 3) :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "update-bugzilla-" + db_name + "-" + project_name + "-" + issue_tracker_name self._issue_tracker_name = issue_tracker_name self._url = url self._product = product self._project_name = project_name self._db_name = db_name self._repo_name = repo_name if num_processes: self._num_processes = num_processes else: self._num_processes = BugzillaIssue2DbUpdate.NUM_PROCESSES config.update({'database': db_name}) self._config = config self._logging_util = LoggingUtil() self._logger = None self._fileHandler = None self._dao = None def _update_issue_content(self, repo_id, issue_tracker_id, intervals, url): #updates issues already stored in the DB queue_intervals = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results) for interval in intervals: issue_extractor = BugzillaIssue2Db(self._db_name, repo_id, issue_tracker_id, url, self._product, interval, self._config, self._log_path) queue_intervals.put(issue_extractor) # Add end-of-queue markers multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals) # Wait for all of the tasks to finish queue_intervals.join() def _update_issue_dependency(self, repo_id, issue_tracker_id, intervals, url): #updates issue dependencies already stored in the DB queue_intervals = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results) for interval in intervals: issue_dependency_extractor = BugzillaIssueDependency2Db(self._db_name, repo_id, issue_tracker_id, url, self._product, interval, self._config, self._log_path) queue_intervals.put(issue_dependency_extractor) # Add end-of-queue markers multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals) # Wait for all of the tasks to finish queue_intervals.join() def _update_issues(self): #updates issues project_id = self._dao.select_project_id(self._project_name) repo_id = self._dao.select_repo_id(project_id, self._repo_name) issue_tracker_id = self._dao.select_issue_tracker_id(repo_id, self._issue_tracker_name) issue_tracker_url = self._url if issue_tracker_id: cursor = self._dao.get_cursor() query = "SELECT i.own_id FROM issue i " \ "JOIN issue_tracker it ON i.issue_tracker_id = it.id " \ "WHERE issue_tracker_id = %s AND repo_id = %s " \ "ORDER BY i.own_id ASC;" arguments = [issue_tracker_id, repo_id] self._dao.execute(cursor, query, arguments) issues = [] row = self._dao.fetchone(cursor) while row: 
issues.append(row[0]) row = self._dao.fetchone(cursor) self._dao.close_cursor(cursor) if issues: intervals = [i for i in multiprocessing_util.get_tasks_intervals(issues, self._num_processes) if len(i) > 0] self._update_issue_content(repo_id, issue_tracker_id, intervals, issue_tracker_url) self._update_issue_dependency(repo_id, issue_tracker_id, intervals, issue_tracker_url) def update(self): """ updates the Bugzilla issue tracker data stored in the DB """ try: self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info") self._logger.info("BugzillaIssue2DbUpdate started") start_time = datetime.now() self._dao = BugzillaDao(self._config, self._logger) self._update_issues() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("BugzillaIssue2DbUpdate finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("BugzillaIssue2DbUpdate failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
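_update_issue_content and _update_issue_dependency follow the fan-out pattern used throughout Gitana: callable task objects go on a JoinableQueue, N consumer processes drain it, and one poison pill per consumer marks the end of the queue. A minimal standalone sketch of that pattern (start_consumers and add_poison_pills are Gitana helpers whose internals are assumed here):

import multiprocessing

def consumer(queue):
    while True:
        task = queue.get()
        if task is None:       # poison pill: no more work for this consumer
            queue.task_done()
            break
        task()                 # Gitana task objects are callables (__call__)
        queue.task_done()

def run(tasks, num_processes):
    queue = multiprocessing.JoinableQueue()
    for _ in range(num_processes):
        multiprocessing.Process(target=consumer, args=(queue,)).start()
    for task in tasks:
        queue.put(task)
    for _ in range(num_processes):  # one pill per consumer
        queue.put(None)
    queue.join()                    # wait for all of the tasks to finish

if __name__ == "__main__":
    run([], 2)  # the workers start, receive their pills, and exit cleanly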
class FileJsonExporter: """ This class handles the export of file information via JSON. It allows to use a former version of the bus factor tool (https://github.com/SOM-Research/busfactor) """ LOG_FOLDER_PATH = "logs" def __init__(self, config, db_name, log_root_path): """ :type config: dict :param config: the DB configuration file :type db_name: str :param db_name: the name of an existing DB :type log_root_path: str :param log_root_path: the log path """ self._date_util = DateUtil() self._db_util = DbUtil() self._logging_util = LoggingUtil() self._log_path = log_root_path + "export-file-json-" + db_name self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._db_name = db_name config.update({'database': db_name}) self._config = config self._cnx = self._db_util.get_connection(self._config) self._db_util.set_database(self._cnx, self._db_name) self._db_util.set_settings(self._cnx) self._file_util = FileUtil(self._config, self._logger) def get_diff_info(self, patch_content): if patch_content: first_line = patch_content.split('\n')[0] if re.match(r"^@@(\s|\+|\-|\d|,)+@@", first_line, re.M): diff_info = first_line.split("@@")[1] else: diff_info = "Binary file" else: diff_info = 'Renamed file' return diff_info def get_diff_content(self, patch_content): if patch_content: lines = patch_content.split('\n') if re.match(r"^@@(\s|\+|\-|\d|,)+@@", lines[0], re.M): first_line_content = lines[0].split("@@")[2] diff_content = lines[1:] diff_content.insert(0, first_line_content) diff_content = '\n'.join(diff_content) else: diff_content = "No content" else: diff_content = "No content" return diff_content def get_patch_info(self, content): diff_info = self.get_diff_info(content) diff_content = self.get_diff_content(content) return { 'info': diff_info, 'content': diff_content.decode('utf-8', 'ignore') } def get_changes_for_file(self, file_ids): file_modifications = [] cursor = self._cnx.cursor() query = "SELECT c.author_id, c.committer_id, c.authored_date, c.committed_date, c.sha, fm.additions, fm.deletions, fm.patch " \ "FROM file_modification fm JOIN file f ON fm.file_id = f.id " \ "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \ "JOIN reference r ON r.id = cin.ref_id " \ "JOIN commit c ON c.id = cin.commit_id " \ "WHERE f.id IN (" + ",".join([str(id) for id in file_ids]) + ") " \ "ORDER BY c.authored_date DESC" cursor.execute(query) row = cursor.fetchone() while row: author_id = row[0] committer_id = row[1] authored_date = row[2].strftime('%Y-%m-%d %H:%M:%S') committed_date = row[3].strftime('%Y-%m-%d %H:%M:%S') sha = str(row[4]) additions = str(row[5]) deletions = str(row[6]) patch = str(row[7]) patch_info = self.get_patch_info(patch) author_name, author_email = self.get_user_identity(author_id) if author_id != committer_id: committer_name, committer_email = self.get_user_identity( committer_id) else: committer_name = author_name committer_email = author_email author = {'name': author_name, 'email': author_email} committer = {'name': committer_name, 'email': committer_email} file_modifications.append({ 'author': author, 'authored_date': authored_date, 'committer': committer, 'committed_date': committed_date, 'additions': additions, 'deletions': deletions, 'sha': sha, 'patch': patch_info }) row = cursor.fetchone() cursor.close() return file_modifications def array2string(self, array): return ','.join(str(x) for x in array) def get_user_identity(self, user_id): found = None cursor =
self._cnx.cursor() query = "SELECT u.name, u.email " \ "FROM user u " \ "JOIN (SELECT IFNULL(ua.alias_id, u.id) as user_id FROM user u LEFT JOIN user_alias ua ON u.id = ua.user_id WHERE u.id = %s) as aliased " \ "ON aliased.user_id = u.id" arguments = [user_id] cursor.execute(query, arguments) row = cursor.fetchone() if row: name = row[0] email = row[1] found = (name, email) return found def get_commits_info(self, file_ids): commits = [] cursor = self._cnx.cursor() query = "SELECT c.sha, c.message, r.name, c.author_id, c.committer_id, c.authored_date, c.committed_date " \ "FROM file_modification fm JOIN file f ON fm.file_id = f.id " \ "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \ "JOIN reference r ON r.id = cin.ref_id " \ "JOIN commit c ON c.id = cin.commit_id " \ "WHERE f.id IN (" + ",".join([str(id) for id in file_ids]) + ")" cursor.execute(query) row = cursor.fetchone() while row: sha = str(row[0]) message = str(row[1].encode('utf8')) ref = str(row[2]) author_id = row[3] committer_id = row[4] authored_date = row[5].strftime('%Y-%m-%d %H:%M:%S') committed_date = row[6].strftime('%Y-%m-%d %H:%M:%S') author_name, author_email = self.get_user_identity(author_id) if author_id != committer_id: committer_name, committer_email = self.get_user_identity( committer_id) else: committer_name = author_name committer_email = author_email author = {'name': author_name, 'email': author_email} committer = {'name': committer_name, 'email': committer_email} commits.append({ 'sha': sha, 'author': author, 'committer': committer, 'message': message, 'ref': ref, 'authored_date': authored_date, 'committed_date': committed_date }) row = cursor.fetchone() cursor.close() return commits def get_status_file(self, file_id): cursor = self._cnx.cursor() query = "SELECT fm.status, MAX(c.committed_date) AS last_modification " \ "FROM file_modification fm JOIN file f ON fm.file_id = f.id " \ "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \ "JOIN commit c ON c.id = cin.commit_id " \ "WHERE f.id = %s" arguments = [file_id] cursor.execute(query, arguments) row = cursor.fetchone() cursor.close() status = "" last_modification = "" if row: status = row[0] last_modification = row[1].strftime('%Y-%m-%d %H:%M:%S') return {'status': status, 'last_modification': last_modification} def add_file_info_to_json(self, references, repo_json): cursor = self._cnx.cursor() query = "SELECT f.id, f.name, f.ext, r.name, r.id " \ "FROM repository repo JOIN commit_in_reference cin ON repo.id = cin.repo_id " \ "JOIN file_modification fm ON fm.commit_id = cin.commit_id " \ "JOIN file f ON f.id = fm.file_id " \ "JOIN reference r ON r.id = cin.ref_id " \ "WHERE repo.id = %s AND r.name IN (" + ",".join(["'" + ref + "'" for ref in references]) + ") AND " \ "f.id NOT IN " \ "(SELECT deletions.file_id FROM " \ "(SELECT fm.file_id, c.committed_date " \ "FROM commit_in_reference cin " \ "JOIN file_modification fm ON fm.commit_id = cin.commit_id " \ "JOIN reference r ON r.id = cin.ref_id " \ "JOIN commit c ON c.id = fm.commit_id " \ "WHERE fm.status = 'deleted' AND cin.repo_id = %s ) as deletions " \ "JOIN " \ "(SELECT fm.file_id, max(c.committed_date) as committed_date " \ "FROM commit_in_reference cin " \ "JOIN file_modification fm ON fm.commit_id = cin.commit_id " \ "JOIN reference r ON r.id = cin.ref_id " \ "JOIN commit c ON c.id = fm.commit_id " \ "WHERE fm.status <> 'deleted' AND cin.repo_id = %s " \ "GROUP BY fm.file_id) AS last_action " \ "ON deletions.file_id = last_action.file_id " \ "WHERE 
deletions.committed_date > last_action.committed_date " \ "UNION " \ "SELECT fr.previous_file_id " \ "FROM file_renamed fr JOIN file f ON fr.previous_file_id = f.id " \ "JOIN file_modification fm ON fm.file_id = f.id " \ "JOIN commit_in_reference cin ON cin.commit_id = fm.commit_id " \ "JOIN reference r ON r.id = cin.ref_id " \ "WHERE cin.repo_id = %s) " \ "GROUP BY f.id, r.id" arguments = [ self._repo_id, self._repo_id, self._repo_id, self._repo_id ] cursor.execute(query, arguments) row = cursor.fetchone() while row: file_id = row[0] file_name = row[1] file_ext = row[2] ref_name = row[3] ref_id = row[4] status = self.get_status_file(file_id) file_history = self._file_util.get_file_history_by_id( file_id, ref_id) file_ids = list(set([h.get("file_id") for h in file_history])) commits = self.get_commits_info(file_ids) directories = self._file_util.get_directories(file_name) changes_info = self.get_changes_for_file(file_ids) file_info = { 'repo': self._repo_name, 'info': status, 'commits': commits, 'ref': ref_name, 'id': str(file_id), 'name': file_name.split('/')[-1], 'ext': file_ext, 'dirs': directories, 'file_changes': changes_info } repo_json.write(json.dumps(file_info) + "\n") row = cursor.fetchone() cursor.close() def export(self, repo_name, references, file_path): """ exports the file data to JSON format :type repo_name: str :param repo_name: name of the repository to analyse :type references: list str :param references: list of references to analyse :type file_path: str :param file_path: the path where to export the file information """ try: self._logger.info("FileJsonExporter started") start_time = datetime.now() repo_json = codecs.open(file_path, 'w', "utf-8") self._repo_name = repo_name self._repo_id = self._db_util.select_repo_id( self._cnx, repo_name, self._logger) self.add_file_info_to_json(references, repo_json) repo_json.close() self._db_util.close_connection(self._cnx) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("FileJsonExporter finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("FileJsonExporter failed", exc_info=True)
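get_diff_info and get_diff_content both key off the unified-diff hunk header, '@@ -a,b +c,d @@ optional context': splitting the first line on '@@' yields the range information at index 1 and the trailing context at index 2. A quick demonstration:

import re

patch = "@@ -1,4 +1,5 @@ def main():\n+import os\n import sys"
first_line = patch.split('\n')[0]
if re.match(r"^@@(\s|\+|\-|\d|,)+@@", first_line, re.M):
    print(first_line.split("@@")[1])  # ' -1,4 +1,5 '  -> stored as diff info
    print(first_line.split("@@")[2])  # ' def main():' -> re-attached to the content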
class BugzillaIssueDependency2Db(object): """ This class inserts the dependencies between Bugzilla issues """ def __init__(self, db_name, repo_id, issue_tracker_id, url, product, interval, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type issue_tracker_id: int :param issue_tracker_id: the id of an existing issue tracker in the DB :type url: str :param url: the URL of the bugzilla issue tracker :type product: str :param product: the name of the product in the bugzilla issue tracker :type interval: list int :param interval: a list of issue ids to import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._url = url self._product = product self._db_name = db_name self._repo_id = repo_id self._issue_tracker_id = issue_tracker_id self._interval = interval self._logging_util = LoggingUtil() self._config = config self._filehandler = None self._logger = None self._querier = None self._dao = None def __call__(self): try: log_path = self._log_root_path + "-issue2db-dependency" + \ str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._filehandler = self._logging_util.get_file_handler( self._logger, log_path, "info") self._querier = BugzillaQuerier(self._url, self._product, self._logger) self._dao = BugzillaDao(self._config, self._logger) self.extract() except Exception: self._logger.error("Issue2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _extract_single_issue_dependency(self, issue_id, data, type): # inserts issue dependency extracted = None if isinstance(data, int): extracted = data else: if "show_bug" in data: extracted = data.split("?id=")[1] if extracted: dependent_issue = self._dao.select_issue_id( extracted, self._issue_tracker_id, self._repo_id) if dependent_issue: self._dao.insert_issue_dependency(issue_id, dependent_issue, type) def _extract_issue_dependency(self, issue_id, obj, type): # processes issue dependencies if isinstance(obj, list): for issue in obj: self._extract_single_issue_dependency(issue_id, issue, type) else: self._extract_single_issue_dependency(issue_id, obj, type) def _is_duplicated(self, issue): flag = True try: issue.dupe_of except: flag = False return flag def _set_dependencies(self): cursor = self._dao.get_cursor() query = "SELECT i.id FROM issue i " \ "JOIN issue_tracker it ON i.issue_tracker_id = it.id " \ "WHERE i.id >= %s AND i.id <= %s AND issue_tracker_id = %s AND repo_id = %s" arguments = [ self._interval[0], self._interval[-1], self._issue_tracker_id, self._repo_id ] self._dao.execute(cursor, query, arguments) row = self._dao.fetchone(cursor) while row: try: issue_id = row[0] issue_own_id = self._dao.select_issue_own_id( issue_id, self._issue_tracker_id, self._repo_id) issue = self._querier.get_issue(issue_own_id) if issue.blocks: self._extract_issue_dependency( issue_id, self._querier.get_issue_blocks(issue), self._dao.get_issue_dependency_type_id("block")) if issue.depends_on: self._extract_issue_dependency( issue_id, self._querier.get_issue_depends_on(issue), self._dao.get_issue_dependency_type_id("depends")) if issue.see_also: self._extract_issue_dependency( issue_id, self._querier.get_issue_see_also(issue), self._dao.get_issue_dependency_type_id("related")) if self._is_duplicated(issue): if issue.dupe_of: 
self._extract_issue_dependency( issue_id, self._querier.get_issue_dupe_of(issue), self._dao.get_issue_dependency_type_id( "duplicated")) except Exception: self._logger.error( "something went wrong with the following issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) row = self._dao.fetchone(cursor) self._dao.close_cursor(cursor) def extract(self): """ extracts Bugzilla issue dependency data and stores it in the DB """ try: self._logger.info("BugzillaIssueDependency2Db started") start_time = datetime.now() self._set_dependencies() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("BugzillaIssueDependency2Db finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._filehandler) except Exception: self._logger.error("BugzillaIssueDependency2Db failed", exc_info=True)
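_extract_single_issue_dependency accepts either a plain integer issue id or a Bugzilla 'show_bug.cgi?id=NNN' URL (as found in see_also fields) and normalises both to an id. The branching logic in isolation:

def extract_issue_id(data):
    if isinstance(data, int):
        return data
    if "show_bug" in data:
        return data.split("?id=")[1]
    return None

print(extract_issue_id(42))  # 42
print(extract_issue_id("https://bugs.eclipse.org/bugs/show_bug.cgi?id=512017"))  # '512017'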
class Git2DbReference(object): """ This class handles the import of Git references """ #do not import patches LIGHT_IMPORT_TYPE = 1 #import patches but not at line level MEDIUM_IMPORT_TYPE = 2 #import patches also at line level FULL_IMPORT_TYPE = 3 def __init__(self, db_name, repo_id, git_repo_path, before_date, import_type, ref_name, ref_type, from_sha, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type git_repo_path: str :param git_repo_path: local path of the Git repository :type before_date: str :param before_date: import data before date (YYYY-mm-dd) :type import_type: int :param import_type: 1 does not import patches, 2 imports patches but not at line level, 3 imports patches with line detail :type ref_name: str :param ref_name: the name of the reference to import :type from_sha: str :param from_sha: the SHA of the commit from where to start the import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._git_repo_path = git_repo_path self._repo_id = repo_id self._db_name = db_name self._ref_name = ref_name self._ref_type = ref_type self._before_date = before_date self._import_type = import_type self._from_sha = from_sha self._config = config self._logging_util = LoggingUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): try: log_path = self._log_root_path + "-git2db-" + self._make_it_printable( self._ref_name) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, log_path, "info") self._querier = GitQuerier(self._git_repo_path, self._logger) self._dao = GitDao(self._config, self._logger) self.extract() except Exception: self._logger.error("Git2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _make_it_printable(self, str): #converts string to UTF-8 and removes empty and non-alphanumeric characters u = str.decode('utf-8', 'ignore').lower() return re.sub(r'(\W|\s)+', '-', u) def _get_info_contribution_in_reference(self, reference_name, reference_type, repo_id, from_sha): if from_sha: if self._before_date: commits = self._querier.collect_all_commits_after_sha_before_date( reference_name, from_sha, self._before_date) else: commits = self._querier.collect_all_commits_after_sha( reference_name, from_sha) self._analyse_commits(commits, reference_name, repo_id) else: if self._before_date: commits = self._querier.collect_all_commits_before_date( reference_name, self._before_date) else: commits = self._querier.collect_all_commits(reference_name) self._analyse_commits(commits, reference_name, repo_id) def _load_all_references(self, repo_id): # load all git branches and tags into database for reference in self._querier.get_references(): ref_name = reference[0] ref_type = reference[1] #inserts reference to DB self._dao.insert_reference(repo_id, ref_name, ref_type) def _get_diffs_from_commit(self, commit, files_in_commit): #calculates diffs within files in a commit if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE: diffs = self._querier.get_diffs(commit, files_in_commit, True) else: diffs = self._querier.get_diffs(commit, files_in_commit, False) return diffs def _analyse_commit(self, commit, repo_id, ref_id): #analyses a commit try: message = self._querier.get_commit_property(commit, "message") 
author_name = self._querier.get_commit_property( commit, "author.name") author_email = self._querier.get_commit_property( commit, "author.email") committer_name = self._querier.get_commit_property( commit, "committer.name") committer_email = self._querier.get_commit_property( commit, "committer.email") size = self._querier.get_commit_property(commit, "size") sha = self._querier.get_commit_property(commit, "hexsha") authored_date = self._querier.get_commit_time( self._querier.get_commit_property(commit, "authored_date")) committed_date = self._querier.get_commit_time( self._querier.get_commit_property(commit, "committed_date")) if author_name is None and author_email is None: self._logger.warning( "author name and email are null for commit: " + sha) if committer_name is None and committer_email is None: self._logger.warning( "committer name and email are null for commit: " + sha) #insert author author_id = self._dao.get_user_id(author_name, author_email) committer_id = self._dao.get_user_id(committer_name, committer_email) commit_found = self._dao.select_commit_id(sha, repo_id) if not commit_found: #insert commit self._dao.insert_commit(repo_id, sha, message, author_id, committer_id, authored_date, committed_date, size) commit_found = self._dao.select_commit_id(sha, repo_id) commit_stats_files = commit.stats.files try: if self._querier.commit_has_no_parents(commit): for diff in self._querier.get_diffs_no_parent_commit( commit): file_path = diff[0] ext = self._querier.get_ext(file_path) self._dao.insert_file(repo_id, file_path, ext) file_id = self._dao.select_file_id( repo_id, file_path) if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE: patch_content = re.sub(r'^(\w|\W)*\n@@', '@@', diff[1]) else: patch_content = None stats = self._querier.get_stats_for_file( commit_stats_files, file_path) status = self._querier.get_status_with_diff( stats, diff) #insert file modification self._dao.insert_file_modification( commit_found, file_id, status, stats[0], stats[1], stats[2], patch_content) if self._import_type == Git2DbReference.FULL_IMPORT_TYPE: file_modification_id = self._dao.select_file_modification_id( commit_found, file_id) line_details = self._querier.get_line_details( patch_content, ext) for line_detail in line_details: self._dao.insert_line_details( file_modification_id, line_detail) else: for diff in self._get_diffs_from_commit( commit, commit_stats_files.keys()): #self.dao.check_connection_alive() if self._querier.is_renamed(diff): file_previous = self._querier.get_rename_from( diff) ext_previous = self._querier.get_ext( file_previous) file_current = self._querier.get_file_current( diff) ext_current = self._querier.get_ext( file_current) #insert new file self._dao.insert_file(repo_id, file_current, ext_current) #get id new file current_file_id = self._dao.select_file_id( repo_id, file_current) #retrieve the id of the previous file previous_file_id = self._dao.select_file_id( repo_id, file_previous) #insert file modification self._dao.insert_file_modification( commit_found, current_file_id, "renamed", 0, 0, 0, None) if not previous_file_id: self._dao.insert_file( repo_id, file_previous, ext_previous) previous_file_id = self._dao.select_file_id( repo_id, file_previous) if current_file_id == previous_file_id: self._logger.warning( "previous file id is equal to current file id (" + str(current_file_id) + ") " + str(sha)) else: file_modification_id = self._dao.select_file_modification_id( commit_found, current_file_id) self._dao.insert_file_renamed( repo_id, current_file_id, 
previous_file_id, file_modification_id) else: #insert file #if the file does not have a path, it won't be inserted try: file_path = self._querier.get_file_path( diff) ext = self._querier.get_ext(file_path) stats = self._querier.get_stats_for_file( commit_stats_files, file_path) status = self._querier.get_status_with_diff( stats, diff) #if the file is new, add it if self._querier.is_new_file(diff): self._dao.insert_file( repo_id, file_path, ext) file_id = self._dao.select_file_id( repo_id, file_path) if not file_id: self._dao.insert_file( repo_id, file_path, ext) file_id = self._dao.select_file_id( repo_id, file_path) if self._import_type > Git2DbReference.LIGHT_IMPORT_TYPE: #insert file modification (additions, deletions) patch_content = self._querier.get_patch_content( diff) else: patch_content = None self._dao.insert_file_modification( commit_found, file_id, status, stats[0], stats[1], stats[2], patch_content) if self._import_type == Git2DbReference.FULL_IMPORT_TYPE: file_modification_id = self._dao.select_file_modification_id( commit_found, file_id) line_details = self._querier.get_line_details( patch_content, ext) for line_detail in line_details: self._dao.insert_line_details( file_modification_id, line_detail) except Exception: self._logger.error( "Something went wrong with commit " + str(sha), exc_info=True) except Exception: self._logger.error("Git2Db failed on commit " + str(sha), exc_info=True) # insert parents of the commit self._dao.insert_commit_parents(commit.parents, commit_found, sha, repo_id) # insert commits in reference self._dao.insert_commit_in_reference(repo_id, commit_found, ref_id) #return commit_found except Exception: self._logger.error("Git2Db failed on commit " + str(sha), exc_info=True) def _analyse_commits(self, commits, ref, repo_id): #analyses commits in references ref_id = self._dao.select_reference_id(repo_id, ref) commits_in_reference = [] for c in commits: self._analyse_commit(c, repo_id, ref_id) # self.logger.info("analysing commit " + str(commits.index(c)+1) + "/" + str(len(commits))) # to_insert = self._analyse_commit(c, repo_id, ref_id) # if to_insert: # commits_in_reference.append((repo_id, to_insert, ref_id)) # self._analyse_commit(c, repo_id, ref_id) #self._dao.insert_commits_in_reference(commits_in_reference) def extract(self): """ extracts Git data and stores it in the DB """ try: self._logger.info("Git2DbReference started") start_time = datetime.now() self._load_all_references(self._repo_id) self._get_info_contribution_in_reference(self._ref_name, self._ref_type, self._repo_id, self._from_sha) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("Git2DbReference finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("Git2DbReference failed", exc_info=True)
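For root commits (no parents), _analyse_commit strips the per-file diff header so that only the patch body from the first hunk onwards is stored; the greedy regex consumes everything up to the last newline that precedes an '@@'. In isolation:

import re

raw_diff = ("diff --git a/foo.py b/foo.py\n"
            "new file mode 100644\n"
            "--- /dev/null\n"
            "+++ b/foo.py\n"
            "@@ -0,0 +1,2 @@\n"
            "+import sys\n"
            "+print(sys.version)")
patch_content = re.sub(r'^(\w|\W)*\n@@', '@@', raw_diff)
print(patch_content.split('\n')[0])  # '@@ -0,0 +1,2 @@'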
class SlackChannel2Db(object): """ This class handles the import of Slack channels """ def __init__(self, db_name, instant_messaging_id, interval, token, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type instant_messaging_id: int :param instant_messaging_id: the id of an existing instant messaging in the DB :type interval: list int :param interval: a list of channel ids to import :type token: str :param token: a Slack token :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._interval = interval self._db_name = db_name self._instant_messaging_id = instant_messaging_id self._token = token self._config = config self._logging_util = LoggingUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): try: log_path = self._log_root_path + "-channel2db-" + str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") self._querier = SlackQuerier(self._token, self._logger) self._dao = SlackDao(self._config, self._logger) self.extract() except Exception: self._logger.error("Channel2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _insert_not_recognized_url_attachments(self, message_id, urls): #insert not recognized url attachments pos = 0 for url in urls: attachment_own_id = self._querier.generate_url_attachment_id(message_id, pos) attachment_name = self._querier.get_url_attachment_name(url) attachment_extension = self._querier.get_url_attachment_extension(url) self._dao.insert_url_attachment(attachment_own_id, message_id, attachment_name, attachment_extension, url) pos += 1 def _extract_file_attachment_info(self, message, message_id): #insert file attachments file = self._querier.get_file_attachment(message) own_id = self._querier.get_file_attachment_property(file, "id") name = self._querier.get_file_attachment_property(file, "name") extension = self._querier.get_file_attachment_property(file, "filetype") url = self._querier.get_file_attachment_property(file, "permalink") bytes = self._querier.get_file_attachment_property(file, "size") self._dao.insert_attachment(own_id, message_id, name, extension, bytes, url) def _extract_url_attachments(self, message, message_id): #insert URL attachments urls = self._querier.get_url_attachments(self._querier.get_message_body(message)) attachments = self._querier.get_message_attachments(message) for a in attachments: url = self._querier.get_attachment_url(a) name = self._querier.get_attachament_name(a) own_id = self._querier.get_attachment_id(a) extension = self._querier.get_attachment_extension(a) bytes = self._querier.get_attachment_size(a) self._dao.insert_attachment(own_id, message_id, name, extension, bytes, url) if url in urls: urls.remove(a.get('from_url')) self._insert_not_recognized_url_attachments(message_id, urls) def _extract_file_comment(self, channel_id, comment, pos): #insert file comment own_id = self._querier.get_comment_id(comment) body = self._querier.get_comment_body(comment) created_at = self._querier.get_comment_created_at(comment) author_name = self._querier.get_message_author_name(comment) author_email = self._querier.get_message_author_email(comment) author_id = self._dao.get_user_id(author_name, author_email) self._dao.insert_message(own_id, pos, 
self._dao.get_message_type_id("comment"), channel_id, body, author_id, created_at) comment_id = self._dao.select_message_id(own_id, channel_id) return comment_id def _extract_comment(self, message, channel_id): #insert comment pos = 0 message_id = None initial_comment = self._querier.file_attachment_get_comment(message) if initial_comment: own_id = self._querier.get_comment_id(initial_comment) message_id = self._dao.select_message_id(own_id, channel_id) pos = self._dao.get_comments(message_id) comment = self._querier.get_comment_message(message) comment_id = self._extract_file_comment(channel_id, comment, pos) if message_id: self._dao.insert_message_dependency(comment_id, message_id) def _extract_message(self, message, channel_id, type, pos): #insert message author_name = self._querier.get_message_author_name(message) author_email = self._querier.get_message_author_email(message) author_id = self._dao.get_user_id(author_name, author_email) body = self._querier.get_message_body(message) own_id = self._querier.get_message_own_id(message) created_at = self._querier.get_message_created_at(message) if type == "message": message_type = "reply" else: message_type = "info" self._dao.insert_message(own_id, pos, self._dao.get_message_type_id(message_type), channel_id, body, author_id, created_at) message_id = self._dao.select_message_id(own_id, channel_id) self._extract_url_attachments(message, message_id) def _extract_file_upload(self, message, channel_id, pos): #insert file upload own_id = self._querier.get_message_own_id(message) author_name = self._querier.get_message_author_name(message) author_email = self._querier.get_message_author_email(message) author_id = self._dao.get_user_id(author_name, author_email) created_at = self._querier.get_message_created_at(message) body = self._querier.get_message_body(message).split(':')[0] self._dao.insert_message(own_id, pos, self._dao.get_message_type_id("file_upload"), channel_id, body, author_id, created_at) message_id = self._dao.select_message_id(own_id, channel_id) self._extract_file_attachment_info(message, message_id) comment = self._querier.file_attachment_get_comment(message) if comment: comment_id = self._extract_file_comment(channel_id, comment, 0) self._dao.insert_message_dependency(comment_id, message_id) def _extract_messages(self, channel_id, channel_own_id): #insert messages pos = 0 for message in self._querier.get_channel_messages(channel_own_id): type = self._querier.get_message_type(message) if type == "file_comment": self._extract_comment(message, channel_id) elif type == "file_share": self._extract_file_upload(message, channel_id, pos) pos += 1 else: if not self._querier.is_bot_message(message): self._extract_message(message, channel_id, type, pos) #TODO deal with bot messages pos += 1 def extract(self): """ extracts Slack channel data and stores it in the DB """ try: self._logger.info("SlackChannel2Db started") start_time = datetime.now() for channel_id in self._interval: channel_own_id = self._dao.select_channel_own_id(channel_id, self._instant_messaging_id) self._extract_messages(channel_id, channel_own_id) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("SlackChannel2Db finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("SlackChannel2Db failed", exc_info=True)
class EclipseForum2DbUpdate(): """ This class handles the update of Eclipse forum data """ NUM_PROCESSES = 2 def __init__(self, db_name, project_name, forum_name, eclipse_forum_url, num_processes, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type forum_name: str :param forum_name: the name of an existing forum in the DB to update :type eclipse_forum_url: str :param eclipse_forum_url: the URL of the forum :type num_processes: int :param num_processes: number of processes to import the data (default 2) :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "update-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name self._project_name = project_name self._url = eclipse_forum_url self._db_name = db_name self._forum_name = forum_name config.update({'database': db_name}) self._config = config if num_processes: self._num_processes = num_processes else: self._num_processes = EclipseForum2DbUpdate.NUM_PROCESSES self._logging_util = LoggingUtil() self._date_util = DateUtil() self._logger = None self._fileHandler = None self._querier = None self._dao = None def _update_topics_info(self, forum_id): # update topics of a given forum next_page = True while next_page: topics_on_page = self._querier.get_topics() for topic in topics_on_page: topic_own_id = self._querier.get_topic_own_id(topic) topic_in_db = self._dao.get_topic_id(topic_own_id, forum_id) if topic_in_db: views = self._querier.get_topic_views(topic) last_change_at = self._date_util.get_timestamp( self._querier.get_last_change_at(topic), "%a, %d %B %Y %H:%M") self._dao.update_topic_info(topic_in_db, forum_id, views, last_change_at) next_page = self._querier.go_next_page() def _get_topics(self, forum_id): #update topics of a forum topic_ids = self._dao.get_topic_ids(forum_id) if topic_ids: self._update_topics_info(forum_id) intervals = [ i for i in multiprocessing_util.get_tasks_intervals( topic_ids, self._num_processes) if len(i) > 0 ] queue_extractors = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(self._num_processes, queue_extractors, results) for interval in intervals: topic_extractor = EclipseTopic2Db(self._db_name, forum_id, interval, self._config, self._log_path) queue_extractors.put(topic_extractor) # Add end-of-queue markers multiprocessing_util.add_poison_pills(self._num_processes, queue_extractors) # Wait for all of the tasks to finish queue_extractors.join() def update(self): """ updates the Eclipse forum data stored in the DB """ try: self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._logger.info("EclipseForum2DbUpdate started") start_time = datetime.now() self._querier = EclipseForumQuerier(self._url, self._logger) self._dao = EclipseForumDao(self._config, self._logger) self._querier.start_browser() project_id = self._dao.select_project_id(self._project_name) forum_id = self._dao.select_forum_id(self._forum_name, project_id) if forum_id: self._get_topics(forum_id) self._querier.close_browser() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("EclipseForum2DbUpdate finished after " + str(minutes_and_seconds[0]) + " minutes 
and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except: self._logger.error("EclipseForum2DbUpdate failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
class GitHubIssueDependency2Db(object): """ This class inserts the dependencies between GitHub issues """ def __init__(self, db_name, repo_id, issue_tracker_id, url, interval, token, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type issue_tracker_id: int :param issue_tracker_id: the id of an existing issue tracker in the DB :type url: str :param url: full name of the GitHub repository :type interval: list int :param interval: a list of issue ids to import :type token: str :param token: a GitHub token :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._url = url self._db_name = db_name self._repo_id = repo_id self._issue_tracker_id = issue_tracker_id self._interval = interval self._token = token self._config = config self._logging_util = LoggingUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): try: log_path = self._log_root_path + "-issue2db-dependency" + str(self._interval[0]) + \ "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") self._querier = GitHubQuerier(self._url, self._token, self._logger) self._dao = GitHubDao(self._config, self._logger) self.extract() except Exception: self._logger.error("GitHubIssueDependency2Db failed", exc_info=True) def _extract_issue_dependencies(self): # inserts issue dependency cursor = self._dao.get_cursor() query = "SELECT i.id FROM issue i " \ "JOIN issue_tracker it ON i.issue_tracker_id = it.id " \ "WHERE i.id >= %s AND i.id <= %s AND issue_tracker_id = %s AND repo_id = %s" arguments = [self._interval[0], self._interval[-1], self._issue_tracker_id, self._repo_id] self._dao.execute(cursor, query, arguments) row = self._dao.fetchone(cursor) while row: try: issue_id = row[0] issue_own_id = self._dao.select_issue_own_id(issue_id, self._issue_tracker_id, self._repo_id) issue = self._querier.get_issue(issue_own_id) comments = [self._querier.get_issue_body(issue)] + [self._querier.get_issue_comment_body(comment) for comment in self._querier.get_issue_comments(issue)] for c in comments: if c: referenced_issues = self._querier.get_referenced_issues(c) for ri in referenced_issues: referenced_issue_id = self._dao.select_issue_id(ri, self._issue_tracker_id, self._repo_id) self._dao.insert_issue_dependency(referenced_issue_id, issue_own_id, self._dao.get_issue_dependency_type_id("related")) except Exception: self._logger.error("something went wrong with the following issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) row = self._dao.fetchone(cursor) self._dao.close_cursor(cursor) def extract(self): """ extracts GitHub issue dependency data and stores it in the DB """ try: self._logger.info("GitHubIssueDependency2Db started") start_time = datetime.now() self._extract_issue_dependencies() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("GitHubIssueDependency2Db finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("GitHubIssueDependency2Db failed", 
exc_info=True) finally: if self._dao: self._dao.close_connection()
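GitHubQuerier.get_referenced_issues scans issue bodies and comments for cross references; on GitHub these are typically '#123' mentions. A sketch of that extraction (an assumption; the real querier may also resolve full issue URLs):

import re

def get_referenced_issues(text):
    # match '#NNN' at the start of the text or after whitespace
    return [int(m) for m in re.findall(r"(?:^|\s)#(\d+)", text)]

print(get_referenced_issues("duplicate of #42, see also #7"))  # [42, 7]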
class Git2DbUpdate(): """ This class handles the update of Git data """ NUM_PROCESSES = 5 def __init__(self, db_name, project_name, repo_name, git_repo_path, before_date, num_processes, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type repo_name: str :param repo_name: the name of the Git repository to import :type git_repo_path: str :param git_repo_path: the local path of the Git repository :type before_date: str :param before_date: import data before date (YYYY-mm-dd) :type num_processes: int :param num_processes: number of processes to import the data (default 5) :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "import-git-" + db_name + "-" + project_name + "-" + repo_name self._git_repo_path = git_repo_path self._project_name = project_name self._db_name = db_name self._repo_name = repo_name self._before_date = before_date self._existing_refs = [] if num_processes: self._num_processes = num_processes else: self._num_processes = Git2DbUpdate.NUM_PROCESSES config.update({'database': db_name}) self._config = config self._logging_util = LoggingUtil() self._logger = None self._fileHandler = None self._querier = None self._dao = None def _update_existing_references(self, repo_id, import_type): #updates existing references in the DB cursor = self._dao.get_cursor() query = "SELECT c.sha, lc.ref_id " \ "FROM commit c " \ "JOIN (SELECT ref_id, max(commit_id) as last_commit_id_in_ref FROM commit_in_reference WHERE repo_id = %s GROUP BY ref_id) as lc " \ "ON c.id = lc.last_commit_id_in_ref" arguments = [repo_id] self._dao.execute(cursor, query, arguments) queue_references = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(self._num_processes, queue_references, results) row = self._dao.fetchone(cursor) while row: sha = row[0] ref_id = row[1] row = self._dao.fetchone(cursor) ref_name = self._dao.select_reference_name(repo_id, ref_id) for reference in self._querier.get_references(): reference_name = reference[0] if reference_name == ref_name: self._existing_refs.append(ref_name) git_ref_extractor = Git2DbReference(self._db_name, repo_id, self._git_repo_path, self._before_date, import_type, reference[0], reference[1], sha, self._config, self._log_path) queue_references.put(git_ref_extractor) break self._dao.close_cursor(cursor) # Add end-of-queue markers multiprocessing_util.add_poison_pills(self._num_processes, queue_references) # Wait for all of the tasks to finish queue_references.join() def _update_repo(self, repo_id, import_type): #updates Git data self._update_existing_references(repo_id, import_type) def _get_import_type(self, repo_id): #gets import type import_type = 1 import_type += self._dao.line_detail_table_is_empty(repo_id) + self._dao.file_modification_patch_is_empty(repo_id) return import_type def update(self): """ updates the Git data stored in the DB """ try: self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info") self._logger.info("Git2DbUpdate started") start_time = datetime.now() self._querier = GitQuerier(self._git_repo_path, self._logger) self._dao = GitDao(self._config, self._logger) project_id = self._dao.select_project_id(self._project_name) repo_id =
self._dao.select_repo_id(self._repo_name) self._update_repo(repo_id, self._get_import_type(repo_id)) self._dao.restart_connection() self._dao.fix_commit_parent_table(repo_id) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("Git2DbUpdate finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("Git2DbUpdate failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
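# --- Example (sketch): the producer/consumer pattern used by the importers ---
# The helpers multiprocessing_util.start_consumers and add_poison_pills are
# defined elsewhere in the project; this is a minimal, hypothetical
# illustration of how such helpers typically work. Workers pull callable
# tasks (e.g. Git2DbReference instances) from a JoinableQueue until they meet
# a None marker (a "poison pill"), so queue.join() returns once every task
# and pill has been consumed.
import multiprocessing

class _ExampleConsumer(multiprocessing.Process):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self._task_queue = task_queue
        self._result_queue = result_queue

    def run(self):
        while True:
            task = self._task_queue.get()
            if task is None:
                # poison pill: acknowledge it and stop this worker
                self._task_queue.task_done()
                break
            self._result_queue.put(task())
            self._task_queue.task_done()

def _example_start_consumers(num_processes, task_queue, result_queue):
    for _ in range(num_processes):
        _ExampleConsumer(task_queue, result_queue).start()

def _example_add_poison_pills(num_processes, task_queue):
    for _ in range(num_processes):
        task_queue.put(None)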
class GitHubIssue2Db(object): """ This class handles the import of GitHub issues """ def __init__(self, db_name, repo_id, issue_tracker_id, url, interval, token, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type issue_tracker_id: int :param issue_tracker_id: the id of an existing issue tracker in the DB :type url: str :param url: full name of the GitHub repository :type interval: list int :param interval: a list of issue ids to import :type token: str :param token: a GitHub token :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._url = url self._db_name = db_name self._repo_id = repo_id self._issue_tracker_id = issue_tracker_id self._interval = interval self._token = token self._config = config self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-issue2db-" + str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") try: self._querier = GitHubQuerier(self._url, self._token, self._logger) self._dao = GitHubDao(self._config, self._logger) self.extract() except Exception: self._logger.error("GitHubIssue2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _insert_attachments(self, attachments, message_id): #inserts attachments pos = 0 for attachment in attachments: attachment_name = self._querier.get_attachment_name(attachment) attachment_own_id = self._querier.generate_attachment_id(message_id, pos) attachment_url = self._querier.get_attachment_url(attachment) self._dao.insert_attachment(attachment_own_id, message_id, attachment_name, attachment_url) pos += 1 def _find_mentioner_user(self, issue_own_id, actor, created_at): #finds the mentioner user mentioner = None issue = self._querier.get_issue(issue_own_id) candidates = [] if actor: if "@" + actor in self._querier.get_issue_body(issue): issue_creation = self._querier.get_issue_creation_time(issue) #if issue_creation <= created_at: candidates.append((self._querier.get_issue_creator(issue), issue_creation)) for c in self._querier.get_issue_comments(issue): if "@" + actor in self._querier.get_issue_comment_body(c): #if c.created_at <= created_at: candidates.append((c.user, c.created_at)) if candidates: found = min(candidates, key=lambda candidate: abs(candidate[1] - created_at)) mentioner = found[0] else: self._logger.warning("mentioner not found for issue " + str(issue_own_id)) #it may happen that the actor is not part of GitHub anymore, so in order to detect the mentioner, the datetime of the #mentioned event is compared with the creation times of the issues and comments else: if self._querier.get_issue_creation_time(issue) == created_at: mentioner = self._querier.get_issue_creator(issue) else: found = [c for c in self._querier.get_issue_comments(issue) if c.created_at == created_at] if found: if len(found) == 1: mentioner = found[0].user else: self._logger.warning("multiple mentioners for issue " + str(issue_own_id)) if not mentioner: self._logger.warning("mentioner not found for issue " + str(issue_own_id)) return mentioner def _extract_history(self, issue_id, issue_own_id, history): 
#inserts the history of an issue for event in history: created_at = None try: created_at = self._querier.get_event_creation_time(event) actor = self._querier.get_event_actor(event) actor_id = self._dao.get_user_id(self._querier.get_user_name(actor), self._querier.get_user_email(actor)) action = event.event if action in ["opened", "edited", "closed", "reopened"]: self._dao.insert_event_type(action) event_type_id = self._dao.select_event_type(action) self._dao.insert_issue_event(issue_id, event_type_id, action, actor_id, created_at, None) elif action in ["labeled", "unlabeled"]: self._dao.insert_event_type(action) event_type_id = self._dao.select_event_type(action) self._dao.insert_issue_event(issue_id, event_type_id, event._rawData.get('label').get('name').lower(), actor_id, created_at, None) elif action in ["mentioned"]: self._dao.insert_event_type(action) event_type_id = self._dao.select_event_type(action) user_mentioner = self._find_mentioner_user(issue_own_id, self._querier.get_user_name(actor), created_at) user_id = self._dao.get_user_id(self._querier.get_user_name(user_mentioner), self._querier.get_user_email(user_mentioner)) self._dao.insert_issue_event(issue_id, event_type_id, self._querier.get_user_name(user_mentioner), user_id, created_at, actor_id) elif action in ["subscribed"]: self._dao.insert_event_type(action) event_type_id = self._dao.select_event_type(action) self._dao.insert_issue_event(issue_id, event_type_id, action, actor_id, created_at, None) elif action in ["assigned", "unassigned"]: self._dao.insert_event_type(action) event_type_id = self._dao.select_event_type(action) assignee_login = event._rawData.get('assignee').get('login') assignee = self._querier.find_user(assignee_login) if assignee: assignee_id = self._dao.get_user_id(self._querier.get_user_name(assignee), self._querier.get_user_email(assignee)) else: assignee_id = self._dao.get_user_id(assignee_login, None) assigner_login = event._rawData.get('assigner').get('login') assigner = self._querier.find_user(assigner_login) if assigner: assigner_id = self._dao.get_user_id(self._querier.get_user_name(assigner), self._querier.get_user_email(assigner)) else: assigner_id = self._dao.get_user_id(assigner_login, None) self._dao.insert_issue_event(issue_id, event_type_id, action, assigner_id, created_at, assignee_id) except Exception: self._logger.warning("event at (" + str(created_at) + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_subscribers(self, issue_id, subscribers): #inserts subscribers of an issue for subscriber in subscribers: try: subscriber_id = self._dao.get_user_id(self._querier.get_user_name(subscriber), self._querier.get_user_email(subscriber)) self._dao.insert_subscriber(issue_id, subscriber_id) except Exception: self._logger.warning("subscriber (" + subscriber.login + ") not inserted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_assignees(self, issue_id, assignees): #inserts the assignees of an issue for assignee in assignees: try: assignee_login = assignee.get('login') assignee = self._querier.find_user(assignee_login) if assignee: assignee_id = self._dao.get_user_id(self._querier.get_user_name(assignee), self._querier.get_user_email(assignee)) else: assignee_id = self._dao.get_user_id(assignee_login, None) self._dao.insert_assignee(issue_id, assignee_id) except Exception: self._logger.warning("assignee (" + str(assignee_login) + ") not inserted for issue id: " + str(issue_id) + " - 
tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_first_comment(self, issue_id, issue): #inserts first issue comment created_at = self._querier.get_issue_creation_time(issue) author = self._querier.get_issue_creator(issue) author_id = self._dao.get_user_id(self._querier.get_user_name(author), self._querier.get_user_email(author)) body = self._querier.get_issue_body(issue) self._dao.insert_issue_comment(0, 0, self._dao.get_message_type_id("comment"), issue_id, body, None, author_id, created_at) def _extract_comments(self, issue_id, issue, comments): #inserts the comments of an issue self._extract_first_comment(issue_id, issue) pos = 1 for comment in comments: try: own_id = self._querier.get_issue_comment_id(comment) body = self._querier.get_issue_comment_body(comment) author = self._querier.get_issue_comment_author(comment) author_id = self._dao.get_user_id(self._querier.get_user_name(author), self._querier.get_user_email(author)) created_at = self._querier.get_issue_comment_creation_time(comment) self._dao.insert_issue_comment(own_id, pos, self._dao.get_message_type_id("comment"), issue_id, body, None, author_id, created_at) attachments = self._querier.get_attachments(body) if attachments: issue_comment_id = self._dao.select_issue_comment_id(own_id, issue_id, created_at) self._insert_attachments(attachments, issue_comment_id) except Exception: self._logger.warning("comment(" + str(pos) + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) continue pos += 1 def _extract_labels(self, issue_id, labels): #inserts the labels of an issue for label in labels: try: digested_label = re.sub("^\W+", "", re.sub("\W+$", "", label.lower())) self._dao.insert_label(digested_label.strip()) label_id = self._dao.select_label_id(digested_label) self._dao.assign_label_to_issue(issue_id, label_id) except Exception: self._logger.warning("label (" + label + ") not extracted for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _extract_issue_commit_dependency(self, issue_id, commits): #inserts the dependencies between an issue and commits for id in commits: commit_id = self._dao.select_commit(id, self._repo_id) if commit_id: self._dao.insert_issue_commit_dependency(issue_id, commit_id) def _get_issue_info(self, issue_own_id): #processes each single issue flag_insert_issue_data = False issue = self._querier.get_issue(issue_own_id) summary = self._querier.get_issue_summary(issue) component = None version = self._querier.get_issue_version(issue) hardware = None priority = None severity = None created_at = self._querier.get_issue_creation_time(issue) last_change_at = self._querier.get_issue_last_change_time(issue) reference_id = self._dao.find_reference_id(version, issue_own_id, self._repo_id) user = self._querier.get_issue_creator(issue) user_id = self._dao.get_user_id(self._querier.get_user_name(user), self._querier.get_user_email(user)) stored_issue_last_change = self._dao.select_last_change_issue(issue_own_id, self._issue_tracker_id, self._repo_id) if stored_issue_last_change: if last_change_at != stored_issue_last_change: flag_insert_issue_data = True self._dao.update_issue(issue_own_id, self._issue_tracker_id, summary, component, version, hardware, priority, severity, reference_id, last_change_at) else: flag_insert_issue_data = True self._dao.insert_issue(issue_own_id, self._issue_tracker_id, summary, component, version, hardware, priority, severity, reference_id, user_id, 
created_at, last_change_at) if flag_insert_issue_data: issue_id = self._dao.select_issue_id(issue_own_id, self._issue_tracker_id, self._repo_id) try: self._extract_labels(issue_id, self._querier.get_issue_tags(issue)) except Exception: self._logger.error("GitHubError when extracting tags for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) try: self._extract_comments(issue_id, issue, self._querier.get_issue_comments(issue)) except Exception: self._logger.error("GitHubError when extracting comments for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) try: issue_history = self._querier.get_issue_history(issue) self._extract_history(issue_id, issue_own_id, issue_history) self._extract_subscribers(issue_id, self._querier.get_issue_subscribers(issue_history)) self._extract_assignees(issue_id, self._querier.get_issue_assignees(issue_history)) self._extract_issue_commit_dependency(issue_id, self._querier.get_commit_dependencies(issue_history)) except Exception: self._logger.error("GitHubError when extracting history for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def _get_issues(self): #processes issues for issue_id in self._interval: try: self._get_issue_info(issue_id) except Exception: self._logger.error("something went wrong for issue id: " + str(issue_id) + " - tracker id " + str(self._issue_tracker_id), exc_info=True) def extract(self): """ extracts GitHub issue data and stores it in the DB """ try: self._logger.info("GitHubIssue2Db started") start_time = datetime.now() self._get_issues() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("GitHubIssue2Db finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("GitHubIssue2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
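# --- Example (sketch): nearest-timestamp matching in _find_mentioner_user ---
# When the actor is known, _find_mentioner_user collects (user, datetime)
# candidates from the issue body and its comments and keeps the one whose
# creation time is closest to the "mentioned" event time. The selection is a
# single min() over timedeltas; the names and dates below are hypothetical.
from datetime import datetime

def _example_closest_candidate(candidates, created_at):
    # candidates: list of (user, datetime) pairs
    return min(candidates, key=lambda candidate: abs(candidate[1] - created_at))[0]

# _example_closest_candidate([("alice", datetime(2016, 1, 1)),
#                             ("bob", datetime(2016, 1, 3))],
#                            datetime(2016, 1, 2, 20))  # -> "bob"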
class FileUtilWrapper(): """ This class wraps the operations provided by the FileUtil class """ def __init__(self, db_name, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "file-util-wrapper-" + db_name self._db_name = db_name config.update({'database': db_name}) self._config = config self._logging_util = LoggingUtil() self._logger = self._logging_util.get_logger(self._log_path) def get_file_history(self, repo_name, file_name, reference_name, reversed=False, before_date=None): """ get file history for a given file name within a reference. Optionally, the history can be retrieved before a given date :type repo_name: str :param repo_name: the name of an existing repository in the DB :type file_name: str :param file_name: the name of the target file :type reference_name: str :param reference_name: the name of the reference :type reversed: bool :param reversed: if True, it returns the changes from the most recent to the earliest :type before_date: str (YYYY-mm-dd) :param before_date: if not null, it returns the file history before the given date """ history = [] try: self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._logger.info("FileUtilWrapper started") start_time = datetime.now() file_util = FileUtil(self._config, self._logger) history = file_util.get_file_history_by_name( repo_name, file_name, reference_name, reversed, before_date) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("FileUtilWrapper finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("FileUtilWrapper failed", exc_info=True) finally: return history def get_file_version(self, repo_name, file_name, reference_name, before_date=None): """ get file version for a given file name within a reference. Optionally, the version can be retrieved before a given date :type repo_name: str :param repo_name: the name of an existing repository in the DB :type file_name: str :param file_name: the name of the target file :type reference_name: str :param reference_name: the name of the reference :type before_date: str (YYYY-mm-dd) :param before_date: if not null, it returns the last version of the file before the given date """ content = "" try: self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._logger.info("FileUtilWrapper started") start_time = datetime.now() file_util = FileUtil(self._config, self._logger) content = file_util.get_file_version_by_name( repo_name, file_name, reference_name, before_date) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("FileUtilWrapper finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("FileUtilWrapper failed", exc_info=True) finally: return content
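# --- Example (sketch): using FileUtilWrapper --------------------------------
# A hypothetical invocation; the connection settings, DB name, repository
# name, and file path below are placeholders, not values used by the project.
#
# config = {'user': 'root', 'password': 'root', 'host': 'localhost',
#           'port': '3306', 'raise_on_warnings': False, 'buffered': True}
# wrapper = FileUtilWrapper("db_example", config, "logs/")
# history = wrapper.get_file_history("repo_example", "src/main.py", "master")
# content = wrapper.get_file_version("repo_example", "src/main.py", "master",
#                                    before_date="2016-01-01")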
class GitHubUtil(): """ This class helps mapping the identities of the users in the vcs and GitHub """ def __init__(self, db_name, project_name, repo_name, github_repo_full_name, tokens, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type repo_name: str :param repo_name: the name of an existing repository in the DB :type github_repo_full_name: str :param github_repo_full_name: full name of the GitHub repository :type tokens: list str :param tokens: list of GitHub tokens :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "map-vcs-github-users-" + db_name + "-" + project_name + "-" + repo_name self._project_name = project_name self._db_name = db_name self._repo_name = repo_name self._tokens = tokens self._active_token = 0 self._url = github_repo_full_name config.update({'database': db_name}) self._config = config self._logging_util = LoggingUtil() self._logger = self._logging_util.get_logger(self._log_path) self._db_util = DbUtil() self._cnx = self._db_util.get_connection(self._config) self._git_dao = GitDao(self._config, self._logger) self._github_querier = GitHubQuerier(self._url, self._tokens[self._active_token], self._logger) def _change_token(self): if len(self._tokens) > 1: if not self._github_querier._token_util._is_usuable( self._tokens[self._active_token]): self._active_token = (self._active_token + 1) % len( self._tokens) self._github_querier = GitHubQuerier( self._url, self._tokens[self._active_token], self._logger) def _analyse_user(self, user, unmatched_user, sha): if user: user_name = self._github_querier.get_user_name(user) user_ids = self._db_util.select_all_user_ids_by_name( self._cnx, user_name, self._logger) for user_id in user_ids: try: user_id, alias_id = self._db_util._identify_user_and_alias( self._cnx, unmatched_user, user_id, self._logger) if user_id != alias_id: self._db_util.insert_user_alias( self._cnx, user_id, alias_id, self._logger) self._logger.info("user ids " + str(user_id) + " and " + str(alias_id) + " successfully matched") except Exception: self._logger.error("user id " + str(user_id) + " not matched with user " + str(unmatched_user), exc_info=True) continue else: self._logger.warning("GitHub user not found for commit " + sha) def match(self): """ matches GitHub and Git identities """ try: self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._logger.info("GitHubUtil started") start_time = datetime.now() repo_id = self._git_dao.select_repo_id(self._repo_name) user_ids = self._git_dao.select_all_developer_ids(repo_id) alias_ids = self._db_util.select_all_aliased_user_ids( self._cnx, self._logger) unmatched_users = list(set(user_ids) - set(alias_ids)) for unmatched_user in unmatched_users: matched = False sha = self._git_dao.select_sha_commit_by_user( unmatched_user, repo_id, match_on="author") if sha: author = self._github_querier.get_author_by_commit(sha) self._analyse_user(author, unmatched_user, sha) matched = True else: sha = self._git_dao.select_sha_commit_by_user( unmatched_user, repo_id, match_on="committer") if sha: committer = self._github_querier.get_committer_by_commit( sha) self._analyse_user(committer, unmatched_user, sha) matched = True if not matched: self._logger.warning("No commits found for user " + str(unmatched_user)) end_time = datetime.now() minutes_and_seconds = 
self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("GitHubUtil finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("GitHubUtil failed", exc_info=True) finally: if self._git_dao: self._git_dao.close_connection() if self._cnx: self._db_util.close_connection(self._cnx)
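# --- Example (sketch): round-robin token rotation ----------------------------
# GitHubUtil._change_token above switches to the next token only when the
# current one is no longer usable (e.g. rate-limited). The rotation itself is
# plain modular arithmetic; this standalone sketch takes a hypothetical
# is_usable callable instead of the project's token utility.
class _ExampleTokenRotator(object):

    def __init__(self, tokens):
        self._tokens = tokens
        self._active = 0

    def current(self):
        return self._tokens[self._active]

    def rotate_if_exhausted(self, is_usable):
        # is_usable: callable(token) -> bool, e.g. a rate-limit check
        if len(self._tokens) > 1 and not is_usable(self.current()):
            self._active = (self._active + 1) % len(self._tokens)
        return self.current()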
class ActivityReportExporter(): """ This class handles the generation of reports """ LOG_FOLDER_PATH = "logs" INPUT_PATH = os.path.join(os.path.dirname(resources.__file__), 'queries.json') def __init__(self, config, db_name, log_root_path): """ :type config: dict :param config: the DB configuration file :type db_name: str :param db_name: the name of an existing DB :type log_root_path: str :param log_root_path: the log path """ self._dsl_util = DslUtil() self._date_util = DateUtil() self._db_util = DbUtil() self._logging_util = LoggingUtil() self._log_path = log_root_path + "export-report-" + db_name + ".log" self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info") self._db_name = db_name self._config = config self._cnx = self._db_util.get_connection(self._config) self._db_util.set_database(self._cnx, self._db_name) self._db_util.set_settings(self._cnx) self._chart_generator = ChartGenerator(self._cnx, self._logger) self._html_generator = HtmlGenerator(self._logger) def _create_log_folder(self, name): #creates the log folder if not os.path.exists(name): os.makedirs(name) def _create_output_file(self, filename): #creates the folder that will contain the output file if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise def _load_report_exporter_json(self, json_path): #loads the JSON that drives the report export process with open(json_path) as json_data: data = json.load(json_data) return data.get('report') def _find_entity_id(self, entity_type, name): #finds the id of the entities stored in the DB found = None if entity_type == "project": found = self._db_util.select_project_id(self._cnx, name, self._logger) elif entity_type == "repo": found = self._db_util.select_repo_id(self._cnx, name, self._logger) elif entity_type == "issuetracker": found = self._db_util.select_issue_tracker_id(self._cnx, name, self._logger) elif entity_type == "forum": found = self._db_util.select_forum_id(self._cnx, name, self._logger) elif entity_type == "instantmessaging": found = self._db_util.select_instant_messaging_id(self._cnx, name, self._logger) if not found: self._logger.error("ReportExporter: entity " + str(entity_type) + " with name " + str(name) + " not found!") return found def _get_parameter(self, key, parameters): #gets parameters of the JSON found = None if key in ["AFTERDATE", "INTERVAL"]: found = parameters.get(key.lower()) else: if key.endswith("ID"): found = parameters.get(key[:-2].lower()) if not found: self._logger.error("ReportExporter: parameter " + str(key) + " not found!") return found def _load_query_json(self, metric_name, parameters): #loads the queries in the JSON configuration file with open(ActivityReportExporter.INPUT_PATH) as json_data: data = json.load(json_data) metrics = data.get('queries') try: found = [m for m in metrics if m.get('name') == metric_name][0] query = found.get('query') for k in found.keys(): if k not in ['name', 'query']: k_value = str(self._get_parameter(k, parameters)) query = query.replace(k, k_value) return query except Exception: self._logger.error("ReportExporter: metric " + str(metric_name) + " not found!") def _get_activity_name(self, activity): #gets the name of the activity return activity.replace("_", " ") def _get_activity_type(self, activity): #gets the type of the activity return activity.replace("_activity", "").replace("_", "") def _generate_charts(self, activity, activity_data, project_id, time_span): #generates 
charts entity2charts = {} after_date, interval = self._calculate_time_information(time_span) activity_type = self._get_activity_type(activity) names = activity_data.get('names') measures = activity_data.get('measures') for entity_name in names: entity_id = self._dsl_util.find_entity_id(self._cnx, activity_type, entity_name, self._logger) charts = [] for measure in measures: query = self._load_query_json(measure, {activity_type: entity_id, 'project': project_id, 'afterdate': after_date, 'interval': interval}) charts.append(self._chart_generator.create(query, interval.lower(), measure, time_span)) entity2charts.update({entity_name: charts}) return entity2charts def _calculate_time_information(self, time_span): #calculates the time span information start = None interval = None current_time = datetime.now() if time_span == "this_week": start = self._date_util.get_start_time_span(current_time, "week", "%Y-%m-%d") interval = "DAY" elif time_span == "this_month": start = self._date_util.get_start_time_span(current_time, "month", "%Y-%m-%d") interval = "DAY" elif time_span == "this_year": start = self._date_util.get_start_time_span(current_time, "year", "%Y-%m-%d") interval = "MONTH" else: self._logger.error("ReportExporter: time span " + str(time_span) + " not recognized! Options are: this_week, this_month, this_year") return start, interval def export(self, file_path, json_path): """ exports the Gitana data to a report :type file_path: str :param file_path: the path where to export the report :type json_path: str :param json_path: the path of the JSON that drives the export process """ try: self._logger.info("ReportExporter started") start_time = datetime.now() exporter_data = self._load_report_exporter_json(json_path) project_name = exporter_data.get('project') project_id = self._dsl_util.find_entity_id(self._cnx, "project", project_name, self._logger) time_span = exporter_data.get('time_span') activity2charts = {} for activity in [attr for attr in exporter_data.keys() if attr.endswith('activity')]: activity_name = self._get_activity_name(activity) charts = self._generate_charts(activity, exporter_data.get(activity), project_id, time_span) activity2charts.update({activity_name: charts}) html_page = self._html_generator.create(project_name, activity2charts) with codecs.open(file_path, 'w', encoding='utf8') as f: f.write(html_page) self._db_util.close_connection(self._cnx) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("ReportExporter: process finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("ReportExporter failed", exc_info=True)
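# --- Example (sketch): placeholder substitution in queries.json --------------
# _load_query_json above takes one entry of queries.json and replaces every
# key that is not 'name' or 'query' (e.g. PROJECTID, AFTERDATE, INTERVAL)
# with its runtime value inside the SQL text. A minimal, hypothetical
# equivalent that takes the resolved values directly:
def _example_fill_query(entry, values):
    # entry: e.g. {"name": "commits",
    #              "query": "SELECT ... WHERE project_id = PROJECTID",
    #              "PROJECTID": "the project id"}
    query = entry.get('query')
    for key in entry.keys():
        if key not in ('name', 'query'):
            query = query.replace(key, str(values.get(key)))
    return query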
class Slack2DbMain(): """ This class handles the import of Slack data """ def __init__(self, db_name, project_name, type, instant_messaging_name, before_date, channels, tokens, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type type: str :param type: type of the instant messaging (Slack, IRC) :type instant_messaging_name: str :param instant_messaging_name: the name of the instant messaging to import :type before_date: str :param before_date: import data before date (YYYY-mm-dd) :type channels: list str :param channels: list of channels to import :type tokens: list str :param tokens: list of Slack tokens :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "import-slack-" + db_name + "-" + project_name + "-" + instant_messaging_name self._type = type self._instant_messaging_name = instant_messaging_name self._project_name = project_name self._db_name = db_name self._channels = channels self._before_date = before_date self._tokens = tokens config.update({'database': db_name}) self._config = config self._logging_util = LoggingUtil() self._logger = None self._fileHandler = None self._querier = None self._dao = None def _get_channel_ids(self, instant_messaging_id): #get data source channel ids channel_ids = [] channel_own_ids = self._querier.get_channel_ids( self._before_date, self._channels) for own_id in channel_own_ids: channel = self._querier.get_channel(own_id) last_change_at = self._querier.get_channel_last_change_at(channel) if self._dao.get_channel_last_change_at( own_id, instant_messaging_id) != last_change_at: name = self._querier._get_channel_name(channel) description = self._querier.get_channel_description(channel) created_at = self._querier._get_channel_created_at(channel) channel_id = self._dao.insert_channel(own_id, instant_messaging_id, name, description, created_at, last_change_at) channel_ids.append(channel_id) return channel_ids def _get_channels(self, instant_messaging_id): #processes Slack channels channel_ids = self._get_channel_ids(instant_messaging_id) intervals = [ i for i in multiprocessing_util.get_tasks_intervals( channel_ids, len(self._tokens)) if len(i) > 0 ] queue_extractors = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results) pos = 0 for interval in intervals: topic_extractor = SlackChannel2Db(self._db_name, instant_messaging_id, interval, self._tokens[pos], self._config, self._log_path) queue_extractors.put(topic_extractor) pos += 1 # Add end-of-queue markers multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors) # Wait for all of the tasks to finish queue_extractors.join() def extract(self): """ extracts Slack data and stores it in the DB """ try: self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, self._log_path, "info") self._logger.info("Slack2DbMain started") start_time = datetime.now() self._querier = SlackQuerier(self._tokens[0], self._logger) self._dao = SlackDao(self._config, self._logger) project_id = self._dao.select_project_id(self._project_name) instant_messaging_id = self._dao.insert_instant_messaging( project_id, self._instant_messaging_name, self._type) self._get_channels(instant_messaging_id) end_time = 
datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("Slack2DbMain extract finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("Slack2DbMain extract failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
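# --- Example (sketch): splitting work into per-worker intervals --------------
# Slack2DbMain (like the other importers) relies on
# multiprocessing_util.get_tasks_intervals(items, n) to give each worker (or
# token) a chunk of the ids to process. That helper lives elsewhere; a
# hypothetical contiguous-chunking version with the same shape could be:
def _example_get_tasks_intervals(items, num_chunks):
    # returns num_chunks lists covering items; trailing chunks may be empty,
    # which is why callers filter with "if len(i) > 0"
    size, extra = divmod(len(items), num_chunks)
    intervals, start = [], 0
    for i in range(num_chunks):
        end = start + size + (1 if i < extra else 0)
        intervals.append(items[start:end])
        start = end
    return intervals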
class GitHubIssue2DbUpdate(): """ This class handles the update of GitHub issue tracker data """ NUM_PROCESSES = 5 def __init__(self, db_name, project_name, repo_name, issue_tracker_name, url, tokens, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type repo_name: str :param repo_name: the name of an existing repository in the DB :type issue_tracker_name: str :param issue_tracker_name: the name of the issue tracker to import :type url: str :param url: full name of the GitHub repository :type tokens: list str :param tokens: list of GitHub tokens :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "import-github-" + db_name + "-" + project_name + "-" + issue_tracker_name self._issue_tracker_name = issue_tracker_name self._url = url self._project_name = project_name self._db_name = db_name self._repo_name = repo_name self._tokens = tokens config.update({'database': db_name}) self._config = config self._logging_util = LoggingUtil() self._logger = None self._fileHandler = None self._dao = None def _update_issue_content(self, repo_id, issue_tracker_id, intervals, url): # updates issues already stored in the DB queue_intervals = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(len(self._tokens), queue_intervals, results) pos = 0 for interval in intervals: issue_extractor = GitHubIssue2Db(self._db_name, repo_id, issue_tracker_id, url, interval, self._tokens[pos], self._config, self._log_path) queue_intervals.put(issue_extractor) pos += 1 # Add end-of-queue markers multiprocessing_util.add_poison_pills(len(self._tokens), queue_intervals) # Wait for all of the tasks to finish queue_intervals.join() def _update_issue_dependency(self, repo_id, issue_tracker_id, intervals, url): # updates issue dependencies already stored in the DB queue_intervals = multiprocessing.JoinableQueue() results = multiprocessing.Queue() # Start consumers multiprocessing_util.start_consumers(len(self._tokens), queue_intervals, results) pos = 0 for interval in intervals: issue_dependency_extractor = GitHubIssueDependency2Db( self._db_name, repo_id, issue_tracker_id, url, interval, self._tokens[pos], self._config, self._log_path) queue_intervals.put(issue_dependency_extractor) pos += 1 # Add end-of-queue markers multiprocessing_util.add_poison_pills(len(self._tokens), queue_intervals) # Wait for all of the tasks to finish queue_intervals.join() def _update_issues(self): # updates issues project_id = self._dao.select_project_id(self._project_name) repo_id = self._dao.select_repo_id(project_id, self._repo_name) issue_tracker_id = self._dao.select_issue_tracker_id( repo_id, self._issue_tracker_name) issue_tracker_url = self._url if issue_tracker_id: imported = self._dao.get_already_imported_issue_ids( issue_tracker_id, repo_id) if imported: intervals = [ i for i in multiprocessing_util.get_tasks_intervals( imported, len(self._tokens)) if len(i) > 0 ] self._update_issue_content(repo_id, issue_tracker_id, intervals, issue_tracker_url) self._update_issue_dependency(repo_id, issue_tracker_id, intervals, issue_tracker_url) def update(self): """ updates the GitHub issue tracker data stored in the DB """ try: self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler( 
self._logger, self._log_path, "info") self._logger.info("GitHubIssue2DbUpdate started") start_time = datetime.now() self._dao = GitHubDao(self._config, self._logger) self._update_issues() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("GitHubIssue2DbUpdate finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("GitHubIssue2DbUpdate failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
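# --- Example (sketch): running an issue tracker update -----------------------
# A hypothetical invocation of the class above; the DB, project, repository,
# tracker names and token are placeholders.
#
# updater = GitHubIssue2DbUpdate("db_example", "project_example",
#                                "repo_example", "tracker_example",
#                                "owner/repo", ["<github-token>"],
#                                config, "logs/")
# updater.update()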
class EclipseTopic2Db(object): """ This class handles the import of Eclipse forum topics """ TOPIC_URL = 'https://www.eclipse.org/forums/index.php/t/' def __init__(self, db_name, forum_id, interval, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type forum_id: int :param forum_id: the id of an existing forum in the DB :type interval: list int :param interval: a list of topic ids to import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._interval = interval self._db_name = db_name self._forum_id = forum_id self._config = config self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-topic2db-" + str( self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, log_path, "info") try: self._querier = EclipseForumQuerier(None, self._logger) self._dao = EclipseForumDao(self._config, self._logger) self.extract() except Exception: self._logger.error("EclipseTopic2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _get_message_attachments_info(self, message_id, message): #get attachment information of messages attachments = self._querier.message_get_attachments(message) for a in attachments: url = self._querier.get_attachment_url(a) own_id = self._querier.get_attachment_own_id(a) name = self._querier.get_attachment_name(a) extension = name.split('.')[-1].strip().lower() size = self._querier.get_attachment_size(a) self._dao.insert_message_attachment(url, own_id, name, extension, size, message_id) def _get_message_info(self, topic_id, message, pos): #get information of topic messages own_id = self._querier.get_message_own_id(message) created_at = self._date_util.get_timestamp( self._querier.get_created_at(message), "%a, %d %B %Y %H:%M") body = self._querier.get_message_body(message) author_name = self._querier.get_message_author_name(message) message_id = self._dao.insert_message( own_id, pos, self._dao.get_message_type_id("reply"), topic_id, body, None, self._dao.get_user_id(author_name), created_at) if self._querier.message_has_attachments(message): self._get_message_attachments_info(message_id, message) if pos == 1: self._dao.update_topic_created_at(topic_id, created_at, self._forum_id) def extract(self): """ extracts Eclipse forum topic data and stores it in the DB """ self._logger.info("EclipseTopic2Db started") start_time = datetime.now() for topic_id in self._interval: topic_own_id = self._dao.get_topic_own_id(self._forum_id, topic_id) self._querier.set_url(EclipseTopic2Db.TOPIC_URL + str(topic_own_id) + "/") self._querier.start_browser() time.sleep(3) if 'index.php/e/' in self._querier._url: self._logger.warning("No URL exists for the topic id " + str(topic_id) + " - " + str(self._forum_id)) next_page = True pos = 1 while next_page: messages_on_page = self._querier.get_messages() for message in messages_on_page: self._get_message_info(topic_id, message, pos) pos += 1 next_page = self._querier.go_next_page() self._querier.close_browser() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("EclipseTopic2Db finished after " + str(minutes_and_seconds[0]) + " minutes 
and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
class Code2DbCommitFile(): """ This class handles the import of code function data for a set of commit file pairs """ #import overall function statistics per file LIGHT_IMPORT_TYPE = 1 #import function-level information FULL_IMPORT_TYPE = 2 def __init__(self, db_name, git_repo_path, interval, import_type, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type git_repo_path: str :param git_repo_path: local path of the Git repository :type interval: list dict :param interval: a list of commit file pairs :type import_type: int :param import_type: 1 = import overall function statistics per file, 2 = import function-level information :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._git_repo_path = git_repo_path self._db_name = db_name self._interval = interval self._import_type = import_type self._config = config self._fileHandler = None self._logger = None self._git_querier = None self._code_querier = None self._dao = None self._tmp_root_file = None def __call__(self): self._logging_util = LoggingUtil() log_path = self._log_root_path + "-code2db-" + str( self._interval[0].get('commit_id')) + "_" + str( self._interval[0].get('file_id')) + "-" + str( self._interval[-1].get('commit_id')) + "_" + str( self._interval[-1].get('file_id')) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, log_path, "info") try: self._tmp_root_file = log_path + "-tmp." self._git_querier = GitQuerier(self._git_repo_path, self._logger) self._code_querier = CodeQuerier(self._logger, self._tmp_root_file + "txt") self._dao = GitDao(self._config, self._logger) self.extract() except Exception: self._logger.error("Code2DbCommitFile failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _save_content(self, content, target): out = codecs.open(target, "w+", "utf-8") out.write(content) out.close() def _delete_tmp_files(self, targets): for target in targets: if os.path.exists(target): os.remove(target) def _process_commit_file(self): _tmp_files = set() for i in self._interval: try: commit_id = i.get("commit_id") commit_sha = i.get("commit_sha") file_id = i.get("file_id") file_name = i.get("file_name") file_ext = i.get("file_ext") found = self._dao.select_code_at_commit(commit_id, file_id) if file_ext not in CodeQuerier.FORBIDDEN_EXTENSIONS and not found: check_extension = False if file_ext in CodeQuerier.ALLOWED_EXTENSIONS: check_extension = True if not file_ext: file_ext = "unknown" _tmp_file = self._tmp_root_file + file_ext _tmp_files.add(_tmp_file) file_content_at_revision = self._git_querier.get_file_content( commit_sha, file_name) if file_content_at_revision: self._save_content(file_content_at_revision, _tmp_file) if check_extension: file_info, fun_info = self._code_querier.get_complexity_info( _tmp_file, self._import_type) self._dao.insert_code_at_commit( commit_id, file_id, file_info.get('ccn'), file_info.get('loc'), file_info.get('comments'), file_info.get('blanks'), file_info.get('funs'), file_info.get('tokens'), file_info.get('avg_ccn'), file_info.get('avg_loc'), file_info.get('avg_tokens')) if self._import_type == Code2DbCommitFile.FULL_IMPORT_TYPE: for fi in fun_info: self._dao.insert_function( fi.get('name'), file_id, fi.get('args'), fi.get('loc'), fi.get('tokens'), fi.get('lines'), fi.get('ccn'), fi.get('start'), fi.get('end')) fun_id = 
self._dao.select_function_id( file_id, fi.get('start'), fi.get('end')) self._dao.insert_function_at_commit( fun_id, commit_id) else: file_info = self._code_querier.get_comment_info( _tmp_file) self._dao.insert_code_at_commit( commit_id, file_id, None, file_info.get('loc'), file_info.get('comments'), file_info.get('blanks'), None, None, None, None, None) if len(_tmp_files) >= 20: self._delete_tmp_files(_tmp_files) except Exception: self._logger.error("Code2DbCommitFile failed on pair " + str(commit_sha) + ", " + str(file_name), exc_info=True) if _tmp_files: self._delete_tmp_files(_tmp_files) def extract(self): """ extracts code function data and stores it in the DB """ try: self._logger.info("Code2DbCommitFile started") start_time = datetime.now() self._process_commit_file() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time( end_time, start_time) self._logger.info("Code2DbCommitFile finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger( self._logger, self._fileHandler) except Exception: self._logger.error("Code2DbCommitFile failed", exc_info=True)
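# --- Example (sketch): one way to compute the stored code metrics ------------
# CodeQuerier is defined elsewhere; this sketch assumes the "lizard" package
# (pip install lizard) as one possible backend for the ccn/loc/token figures
# that _process_commit_file persists -- the actual implementation may differ.
import lizard

def _example_complexity_info(path):
    analysis = lizard.analyze_file(path)
    file_info = {'loc': analysis.nloc,
                 'funs': len(analysis.function_list),
                 'tokens': analysis.token_count,
                 'avg_ccn': analysis.average_cyclomatic_complexity}
    fun_info = [{'name': f.name, 'ccn': f.cyclomatic_complexity,
                 'loc': f.nloc, 'tokens': f.token_count,
                 'start': f.start_line, 'end': f.end_line}
                for f in analysis.function_list]
    return file_info, fun_info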