def identify_bugs_in_commits(input_file, output_file=None, project=None):
    """Collect the commits of *input_file* for which bugs were identified.

    The XML file is parsed and ``get_bugs_in_commit`` is called for every
    child of its root.  Each commit with at least one identified bug gets
    an ``<identified_bugs>`` child (one ``<bug bug_id="..."/>`` per bug)
    and is moved under a fresh ``<log>`` element.

    When *output_file* is ``None`` the ``<log>`` element is returned.
    Otherwise the element is serialized and handed to
    ``store_data_into_file`` and nothing is returned.
    """
    logger.info('Identifying bugs in commits')
    logger.debug('Identifying for project: %s', project)

    # Parse the xml-file holding the commits
    logger.debug('Loading the file: %s', input_file)
    parser = etree.XMLParser(remove_blank_text=True, huge_tree=True)
    commits = etree.parse(input_file, parser=parser).getroot()
    logger.debug('Total number of commits: %s', len(commits))

    # Keep only the commits that reference at least one bug
    # NOTE(review): the original formatting was collapsed; this assumes a
    # commit is added to the output only when bugs were identified for it.
    output_root = etree.Element('log')
    for commit in commits:
        bugs_node = etree.Element('identified_bugs')
        for bug_id in get_bugs_in_commit(commit, project):
            etree.SubElement(bugs_node, 'bug').set('bug_id', bug_id)
        if len(bugs_node) == 0:
            continue
        commit.append(bugs_node)
        output_root.append(commit)
    logger.debug('Number of commits with identified-bugs: %s', len(output_root))

    if output_file is not None:
        serialized = etree.tostring(output_root, encoding='unicode',
                                    pretty_print=True)
        store_data_into_file(serialized, output_file)
        return None
    return output_root
def do_main():
    """Parse the command-line options and run the requested actions.

    Options:
      --clone-repos <file-repos-urls> <output-dir>
          Calls ``clone_git_repositories`` with both values.
      --repo-type {git,hg}
          Required by the parser.
      --extract-commits <repo-path> <output-file>
          Calls ``get_hg_logs_as_xml`` on the repository, serializes the
          result and passes it to ``store_data_into_file``.

    NOTE(review): the value of ``--repo-type`` is parsed but never read
    here; cloning always goes through ``clone_git_repositories`` and
    extraction always through ``get_hg_logs_as_xml``.  Confirm whether a
    dispatch on the repository type is intended.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--clone-repos",
        metavar=("<file-repos-urls>", "<output-dir>"),
        nargs=2,
        help="Clone the repositories listed in file-repos-urls"
        + " and store them into the output-dir directory",
    )
    parser.add_argument(
        "--repo-type",
        choices=["git", "hg"],
        required=True,
        help="Specify the type of version-control to work with",
    )
    parser.add_argument(
        "--extract-commits",
        metavar=("<repo-path>", "<output-file>"),
        nargs=2,
        help="Extract the commits of the repository"
        + " and store them as xml into output-file",
    )
    args = parser.parse_args()

    # Execute the options
    if args.clone_repos:
        file_repos_urls, output_dir = args.clone_repos
        clone_git_repositories(file_repos_urls, output_dir)

    if args.extract_commits:
        repo_path, output_file = args.extract_commits
        xml_tree = get_hg_logs_as_xml(repo_path)
        tree_as_string = etree.tostring(xml_tree, encoding="unicode",
                                        pretty_print=True)
        store_data_into_file(tree_as_string, output_file)
def extract_non_bug_related_commits(input_file, output_file=None, project=None):
    """Collect the commits of *input_file* that are not bug-related.

    The XML file is parsed and every child of its root for which
    ``commit_is_bug_related(commit, project)`` is false is moved under a
    fresh ``<log>`` element.

    When *output_file* is ``None`` the ``<log>`` element is returned.
    Otherwise the element is serialized and handed to
    ``store_data_into_file`` and nothing is returned.
    """
    logger.info('Extracting the commits that are NOT related to bugs')
    if project is not None:
        logger.debug('Extracting for project: %s', project)

    # Parse the xml-file holding the commits
    logger.debug('Loading the file: %s', input_file)
    parser = etree.XMLParser(remove_blank_text=True, huge_tree=True)
    commits = etree.parse(input_file, parser=parser).getroot()
    logger.debug('Total number of commits: %s', len(commits))

    # Keep only the commits that are not related to bugs
    output_root = etree.Element('log')
    for commit in commits:
        if commit_is_bug_related(commit, project):
            continue
        output_root.append(commit)
    logger.debug('Number of NON-bug-related-commits: %s', len(output_root))

    if output_file is not None:
        serialized = etree.tostring(output_root, encoding='unicode',
                                    pretty_print=True)
        store_data_into_file(serialized, output_file)
        return None
    return output_root
def _escape_tag_section(rex_tag_data, raw_tag, tag, new_tag, line_id):
    """Return the output lines of one tag-section rebuilt as valid xml.

    *raw_tag* is the raw text of the section (opening tag included).  Its
    content is cleaned with ``escape_xml_illegal_chars`` and serialized
    through an ``etree`` element named *new_tag*; the original attribute
    text, if any, is re-inserted into the opening tag.

    Raises ``Exception`` when *raw_tag* does not match *rex_tag_data* and
    re-raises whatever the conversion raises; both cases are logged with
    *line_id* first.
    """
    match = rex_tag_data.search(raw_tag)
    if not match:
        error_msg = ('Error matching the tag: %s '
                     'at line: %s, with raw-content:\n%s') % (tag, line_id, raw_tag)
        logger.error(error_msg)
        raise Exception(error_msg)
    attrs_1, unescaped_content, attrs_2 = match.groups()
    attrs_data = attrs_1 or attrs_2 or ''

    # Convert the raw-content into a valid-xml-content
    try:
        unescaped_content = escape_xml_illegal_chars(unescaped_content, ' ')
        dummy_element = etree.Element(new_tag)
        dummy_element.text = unescaped_content
    except Exception:
        error_msg = ('Error escaping the tag: %s '
                     'at line: %s, with content:\n%s') % (tag, line_id,
                                                          unescaped_content)
        logger.error(error_msg)
        # Bare raise keeps the original traceback
        raise
    escaped_tag = etree.tounicode(dummy_element)

    # Add the attribute-info back into the opening tag
    if attrs_data.strip() != '':
        attrs_data = escape_xml_illegal_chars(attrs_data, ' ')
        attrs_data = escape(attrs_data)
        escaped_tag = escaped_tag.replace(u'<%s>' % new_tag,
                                          u'<%s%s>' % (new_tag, attrs_data))
    return escaped_tag.splitlines()


def escape_tag_from_invalid_xml_file(filename_invalid_xml, tag, new_tag=None, new_filename_xml=None):
    """
    Escape a section (tag) in an invalid xml file.
    Optionally, it renames the section with new_tag.

    The file is read line by line as UTF-8.  A line that starts with
    ``<tag ...>`` opens a section; the section runs until a line ending
    with ``</tag>`` (or is complete on its own line).  Each section is
    rebuilt as valid xml, every other line is copied unchanged, and the
    result is passed to ``store_data_into_file``.

    :param filename_invalid_xml: path of the file to read.
    :param tag: name of the tag whose sections are escaped.
    :param new_tag: name used for the rewritten sections; defaults to *tag*.
    :param new_filename_xml: destination path; defaults to
        *filename_invalid_xml* (the input is overwritten).
    :raises Exception: when a buffered section cannot be matched or escaped.
    """
    # Function-scope import: keeps the block self-contained
    import io

    logger.debug('Escaping the tag: "%s" in file: %s', tag, filename_invalid_xml)
    if new_tag is None:
        new_tag = tag
    closing_tag = u'</%s>' % tag
    rex_tag_data = re.compile((r'^<{0}(.*?)>(.*?)</{0}>|'
                               r'^<{0}(.*?)>').format(tag), re.S | re.M)

    content_buffer = []
    tag_buffer = []
    tag_has_ended = False
    line_id = 0
    # io.open decodes once at the I/O boundary (works on Python 2 and 3);
    # newline='\n' keeps lines split on '\n' only, without translation.
    with io.open(filename_invalid_xml, 'r', encoding='utf-8', newline='\n') as f:
        for line_id, line in enumerate(f, 1):
            line = line.replace(u'\n', u'', 1)

            # No tag-section is being processed currently
            if not tag_buffer:
                match = rex_tag_data.search(line)
                if match is None:
                    content_buffer.append(line)
                else:
                    # Begin processing a tag-section; the content group is
                    # set only when the closing tag is on this same line
                    tag_buffer.append(line)
                    tag_has_ended = match.group(2) is not None
                continue

            # A tag-section is pending: either it is still open, or it was
            # complete on the previous line and is flushed now
            closes_here = line.endswith(closing_tag)
            if closes_here or not tag_has_ended:
                tag_buffer.append(line)
            if tag_has_ended or closes_here:
                content_buffer.extend(_escape_tag_section(
                    rex_tag_data, u'\n'.join(tag_buffer), tag, new_tag, line_id))
                tag_buffer = []
                if tag_has_ended:
                    # The current line is not part of the flushed section
                    content_buffer.append(line)

    # A section still buffered at end-of-file must not be dropped
    if tag_buffer:
        if tag_has_ended:
            content_buffer.extend(_escape_tag_section(
                rex_tag_data, u'\n'.join(tag_buffer), tag, new_tag, line_id))
        else:
            logger.warning('Unterminated tag: %s at end of file: %s; '
                           'keeping its raw content', tag, filename_invalid_xml)
            content_buffer.extend(tag_buffer)

    # Store it again
    if new_filename_xml is None:
        new_filename_xml = filename_invalid_xml
    store_data_into_file(u'\n'.join(content_buffer), new_filename_xml)