def _parse_git_diff(self, linenum):
    """Parse the section of a Git-style diff describing a single file.

    Beginning at a ``diff --git`` line, this consumes the extended headers
    (file modes, rename/copy information and the ``index`` line), the
    ``---``/``+++`` filename lines and the change content that follows,
    recording everything on a new :py:class:`ParsedDiffFile`.

    Args:
        linenum (int):
            The current line number.

    Returns:
        tuple:
        A tuple containing the following:

        1. The next line number to parse.
        2. The populated :py:class:`ParsedDiffFile` instance for this
           file, or ``None`` if nothing followed the ``diff --git`` line
           or the change was empty and is not one of the kinds we keep.
    """
    def _clean_filename(raw_line, marker, git_prefix):
        # Drop the "--- "/"+++ " marker from the front of the line.
        name = raw_line[len(marker):]

        # Some diffs incorrectly list filenames with a single trailing
        # tab ("--- filename\t"). Strip exactly one.
        if name.endswith(b'\t'):
            name = name[:-1]

        # Strip the Git a/ or b/ prefix, if the diff uses one.
        if name.startswith(git_prefix):
            name = name[2:]

        return name

    lines = self.lines
    git_header_line = lines[linenum]
    linenum += 1

    # If the diff ends right after the header, there is nothing to parse.
    if linenum >= len(lines):
        return linenum, None

    parsed_file = ParsedDiffFile(parsed_diff_change=self.parsed_diff_change)
    parsed_file.append_data(git_header_line)
    parsed_file.append_data(b'\n')
    parsed_file.binary = False

    # Start from the globally-provided blob / commit information. An
    # index header, if present, overrides this further down.
    parsed_file.orig_file_details = self.base_commit_id
    parsed_file.modified_file_details = self.new_commit_id

    headers, linenum = self._parse_extended_headers(linenum)

    # Work out the created/deleted/mode-changed state along with the
    # accompanying UNIX file mode(s).
    if self._is_new_file(headers):
        created_mode_line = headers[b'new file mode'][1]
        parsed_file.append_data(created_mode_line)
        parsed_file.orig_file_details = PRE_CREATION
        parsed_file.new_unix_mode = self._parse_unix_mode(created_mode_line)
    elif self._is_deleted_file(headers):
        removed_mode_line = headers[b'deleted file mode'][1]
        parsed_file.append_data(removed_mode_line)
        parsed_file.deleted = True
        parsed_file.old_unix_mode = self._parse_unix_mode(removed_mode_line)
    elif self._is_mode_change(headers):
        before_mode_line = headers[b'old mode'][1]
        after_mode_line = headers[b'new mode'][1]
        parsed_file.append_data(before_mode_line)
        parsed_file.append_data(after_mode_line)
        parsed_file.old_unix_mode = self._parse_unix_mode(before_mode_line)
        parsed_file.new_unix_mode = self._parse_unix_mode(after_mode_line)

    # Track whether the file was moved or copied. A rename takes
    # precedence; the copy check only runs when it isn't a rename.
    if self._is_moved_file(headers):
        transfer = (b'rename from', b'rename to', 'moved')
    elif self._is_copied_file(headers):
        transfer = (b'copy from', b'copy to', 'copied')
    else:
        transfer = None

    if transfer is not None:
        from_key, to_key, flag_name = transfer

        parsed_file.orig_filename = headers[from_key][0]
        parsed_file.modified_filename = headers[to_key][0]
        setattr(parsed_file, flag_name, True)

        if b'similarity index' in headers:
            parsed_file.append_data(headers[b'similarity index'][1])

        parsed_file.append_data(headers[from_key][1])
        parsed_file.append_data(headers[to_key][1])

    # Until we see actual content, treat the change as empty.
    has_content = False

    if b'index' in headers:
        index_entry = headers[b'index']
        revision_range = index_entry[0].split()[0]
        index_line = index_entry[1]

        if b'..' in revision_range:
            old_revision, new_revision = revision_range.split(b'..')
            parsed_file.orig_file_details = old_revision
            parsed_file.modified_file_details = new_revision

        if self.pre_creation_regexp.match(parsed_file.orig_file_details):
            parsed_file.orig_file_details = PRE_CREATION

        parsed_file.append_data(index_line)

        index_mode = self._parse_unix_mode(index_line)

        if index_mode is not None:
            # This overwrites anything set above. In theory, a Git diff
            # shouldn't have multiple (conflicting) mode lines.
            parsed_file.old_unix_mode = index_mode
            parsed_file.new_unix_mode = index_mode

    # Line number of the first line following the ---/+++ pair, if any.
    content_start = None

    # Collect the changes for this file.
    while linenum < len(lines):
        if self._is_git_diff(linenum):
            # The next file's section begins here.
            break

        if self._is_binary_patch(linenum):
            parsed_file.binary = True
            parsed_file.append_data(lines[linenum])
            parsed_file.append_data(b'\n')
            has_content = True
            linenum += 1
            break

        if not self._is_diff_fromfile_line(linenum):
            # Ordinary diff content.
            has_content = True
            linenum = self.parse_diff_line(linenum, parsed_file)
            continue

        from_line = lines[linenum]
        to_line = lines[linenum + 1]
        old_name = _clean_filename(from_line, b'--- ', b'a/')
        new_name = _clean_filename(to_line, b'+++ ', b'b/')

        # A /dev/null side has no name of its own, so borrow the name
        # from the other side.
        old_is_null = old_name == b'/dev/null'

        if old_is_null:
            parsed_file.orig_file_details = PRE_CREATION

        parsed_file.orig_filename = new_name if old_is_null else old_name
        parsed_file.modified_filename = (
            old_name if new_name == b'/dev/null' else new_name)

        parsed_file.append_data(from_line)
        parsed_file.append_data(b'\n')
        parsed_file.append_data(to_line)
        parsed_file.append_data(b'\n')

        linenum += 2
        content_start = linenum

    # With the UNIX file mode and changed lines both known, we can tell
    # whether this is a symlink. The new mode is preferred, falling back
    # to the old one.
    symlink_mode = parsed_file.new_unix_mode or parsed_file.old_unix_mode

    if symlink_mode is not None and stat.S_ISLNK(int(symlink_mode, 8)):
        parsed_file.is_symlink = True

        if content_start is not None:
            for index in range(content_start, linenum):
                content_line = lines[index]

                if content_line.startswith(b'-'):
                    parsed_file.old_symlink_target = content_line[1:].strip()
                elif content_line.startswith(b'+'):
                    parsed_file.new_symlink_target = content_line[1:].strip()

    if not parsed_file.orig_filename:
        # This file didn't have any --- or +++ lines. This usually means
        # the file was deleted or moved without changes. We'll need to
        # fall back to parsing the diff --git line, which is more
        # error-prone.
        assert not parsed_file.modified_filename

        self._parse_diff_git_line(git_header_line, parsed_file, linenum)

    # For an empty change, we keep the file's info only if it is a new
    # 0-length file, a moved file, a copied file, or a deleted 0-length
    # file.
    #
    # TODO: In the future, we'll want to keep empty files so we can show
    #       metadata changes, once that functionality is available in the
    #       diff viewer.
    if not has_content and parsed_file.orig_file_details != PRE_CREATION:
        if not (parsed_file.moved or parsed_file.copied or
                parsed_file.deleted):
            # We didn't find any interesting content, so leave out this
            # file's info.
            #
            # Note that we may want to change this in the future to
            # preserve data like mode changes, but that will require
            # filtering out empty changes at the diff viewer level in a
            # sane way.
            parsed_file.discard()
            return linenum, None

    return linenum, parsed_file
def _parse_git_diff(self, linenum):
    """Parse the section of a Git-style diff describing a single file.

    Beginning at a ``diff --git`` line, this consumes the extended headers
    (file modes, rename/copy information and the ``index`` line), the
    ``---``/``+++`` filename lines and the change content that follows,
    recording everything on a new ``ParsedDiffFile``.

    Args:
        linenum (int):
            The line number of the ``diff --git`` line in ``self.lines``.

    Returns:
        tuple:
        A 2-tuple of:

        1. The next line number to parse.
        2. The populated ``ParsedDiffFile``, or ``None`` if nothing
           followed the ``diff --git`` line or the change was empty and
           is not one of the kinds we keep.
    """
    def _clean_filename(raw_line, marker, git_prefix):
        # Drop the "--- "/"+++ " marker from the front of the line.
        name = raw_line[len(marker):]

        # Some diffs incorrectly list filenames with a single trailing
        # tab ("--- filename\t"). Strip exactly one.
        if name.endswith(b'\t'):
            name = name[:-1]

        # Strip the Git a/ or b/ prefix, if the diff uses one.
        if name.startswith(git_prefix):
            name = name[2:]

        return name

    # Remember where this file's section starts so the header lines can
    # be scanned for a symlink mode once they've been parsed.
    section_start = linenum

    git_header_line = self.lines[linenum]
    parsed_file = ParsedDiffFile()
    parsed_file.append_data(git_header_line)
    parsed_file.append_data(b'\n')
    parsed_file.binary = False
    linenum += 1

    # If the diff ends right after the header, there is nothing to parse.
    if linenum >= len(self.lines):
        return linenum, None

    # Start from the globally-provided blob / commit information. An
    # index header, if present, overrides this further down.
    parsed_file.orig_file_details = self.base_commit_id
    parsed_file.modified_file_details = self.new_commit_id

    headers, linenum = self._parse_extended_headers(linenum)

    # Flag the file as a symlink if any header line carries a file mode
    # that stat considers a symbolic link.
    for header_line in self.lines[section_start:linenum]:
        mode_match = GitDiffParser.FILE_MODE_RE.search(header_line)

        if mode_match and stat.S_ISLNK(int(mode_match.group('mode'), 8)):
            parsed_file.is_symlink = True
            break

    # Record the created/deleted/mode-changed state.
    if self._is_new_file(headers):
        parsed_file.append_data(headers[b'new file mode'][1])
        parsed_file.orig_file_details = PRE_CREATION
    elif self._is_deleted_file(headers):
        parsed_file.append_data(headers[b'deleted file mode'][1])
        parsed_file.deleted = True
    elif self._is_mode_change(headers):
        for mode_key in (b'old mode', b'new mode'):
            parsed_file.append_data(headers[mode_key][1])

    # Track whether the file was moved or copied. A rename takes
    # precedence; the copy check only runs when it isn't a rename.
    transfer_kinds = (
        (self._is_moved_file, b'rename from', b'rename to', 'moved'),
        (self._is_copied_file, b'copy from', b'copy to', 'copied'),
    )

    for applies, from_key, to_key, flag_name in transfer_kinds:
        if not applies(headers):
            continue

        parsed_file.orig_filename = headers[from_key][0]
        parsed_file.modified_filename = headers[to_key][0]
        setattr(parsed_file, flag_name, True)

        if b'similarity index' in headers:
            parsed_file.append_data(headers[b'similarity index'][1])

        parsed_file.append_data(headers[from_key][1])
        parsed_file.append_data(headers[to_key][1])
        break

    # Until we see actual content, treat the change as empty.
    has_content = False

    if b'index' in headers:
        revision_range = headers[b'index'][0].split()[0]

        if b'..' in revision_range:
            old_revision, new_revision = revision_range.split(b'..')
            parsed_file.orig_file_details = old_revision
            parsed_file.modified_file_details = new_revision

        if self.pre_creation_regexp.match(parsed_file.orig_file_details):
            parsed_file.orig_file_details = PRE_CREATION

        parsed_file.append_data(headers[b'index'][1])

    # Collect the changes for this file.
    while linenum < len(self.lines):
        if self._is_git_diff(linenum):
            # The next file's section begins here.
            break

        if self._is_binary_patch(linenum):
            parsed_file.binary = True
            parsed_file.append_data(self.lines[linenum])
            parsed_file.append_data(b'\n')
            has_content = True
            linenum += 1
            break

        if not self._is_diff_fromfile_line(linenum):
            # Ordinary diff content.
            has_content = True
            linenum = self.parse_diff_line(linenum, parsed_file)
            continue

        from_line = self.lines[linenum]
        to_line = self.lines[linenum + 1]
        old_name = _clean_filename(from_line, b'--- ', b'a/')
        new_name = _clean_filename(to_line, b'+++ ', b'b/')

        # A /dev/null side has no name of its own, so borrow the name
        # from the other side.
        old_is_null = old_name == b'/dev/null'

        if old_is_null:
            parsed_file.orig_file_details = PRE_CREATION

        parsed_file.orig_filename = new_name if old_is_null else old_name
        parsed_file.modified_filename = (
            old_name if new_name == b'/dev/null' else new_name)

        parsed_file.append_data(from_line)
        parsed_file.append_data(b'\n')
        parsed_file.append_data(to_line)
        parsed_file.append_data(b'\n')

        linenum += 2

    if not parsed_file.orig_filename:
        # This file didn't have any --- or +++ lines. This usually means
        # the file was deleted or moved without changes. We'll need to
        # fall back to parsing the diff --git line, which is more
        # error-prone.
        assert not parsed_file.modified_filename

        self._parse_diff_git_line(git_header_line, parsed_file, linenum)

    # For an empty change, we keep the file's info only if it is a new
    # 0-length file, a moved file, a copied file, or a deleted 0-length
    # file.
    if not has_content and parsed_file.orig_file_details != PRE_CREATION:
        if not (parsed_file.moved or parsed_file.copied or
                parsed_file.deleted):
            # We didn't find any interesting content, so leave out this
            # file's info.
            #
            # Note that we may want to change this in the future to
            # preserve data like mode changes, but that will require
            # filtering out empty changes at the diff viewer level in a
            # sane way.
            return linenum, None

    return linenum, parsed_file
def _parse_git_diff(self, linenum):
    """Parse the section of a Git-style diff describing a single file.

    Beginning at a ``diff --git`` line, this consumes the extended headers
    (file modes, rename/copy information and the ``index`` line), the
    ``---``/``+++`` filename lines and the change content that follows,
    recording everything on a new ``ParsedDiffFile``. Filenames that are
    still byte strings at the end are decoded as UTF-8.

    Args:
        linenum (int):
            The line number of the ``diff --git`` line in ``self.lines``.

    Returns:
        tuple:
        A 2-tuple of:

        1. The next line number to parse.
        2. The populated ``ParsedDiffFile``, or ``None`` if nothing
           followed the ``diff --git`` line or the change was empty and
           is not one of the kinds we keep.
    """
    # Remember where this file's section starts so the header lines can
    # be scanned for a symlink mode once they've been parsed.
    start_linenum = linenum

    # Now we have a diff we are going to use so get the filenames + commits
    diff_git_line = self.lines[linenum]
    file_info = ParsedDiffFile()
    file_info.append_data(diff_git_line)
    file_info.append_data(b'\n')
    file_info.binary = False
    linenum += 1

    # Check to make sure we haven't reached the end of the diff.
    if linenum >= len(self.lines):
        return linenum, None

    # Assume the blob / commit information is provided globally. If
    # we found an index header we'll override this.
    file_info.origInfo = self.base_commit_id
    file_info.newInfo = self.new_commit_id

    headers, linenum = self._parse_extended_headers(linenum)

    # Flag the file as a symlink if any header line carries a file mode
    # that stat considers a symbolic link.
    for line in self.lines[start_linenum:linenum]:
        m = GitDiffParser.FILE_MODE_RE.search(line)

        if m:
            mode = int(m.group('mode'), 8)

            if stat.S_ISLNK(mode):
                file_info.is_symlink = True
                break

    # Record the created/deleted/mode-changed state.
    if self._is_new_file(headers):
        file_info.append_data(headers[b'new file mode'][1])
        file_info.origInfo = PRE_CREATION
    elif self._is_deleted_file(headers):
        file_info.append_data(headers[b'deleted file mode'][1])
        file_info.deleted = True
    elif self._is_mode_change(headers):
        file_info.append_data(headers[b'old mode'][1])
        file_info.append_data(headers[b'new mode'][1])

    # Track whether the file was moved or copied.
    if self._is_moved_file(headers):
        file_info.origFile = headers[b'rename from'][0]
        file_info.newFile = headers[b'rename to'][0]
        file_info.moved = True

        if b'similarity index' in headers:
            file_info.append_data(headers[b'similarity index'][1])

        file_info.append_data(headers[b'rename from'][1])
        file_info.append_data(headers[b'rename to'][1])
    elif self._is_copied_file(headers):
        file_info.origFile = headers[b'copy from'][0]
        file_info.newFile = headers[b'copy to'][0]
        file_info.copied = True

        if b'similarity index' in headers:
            file_info.append_data(headers[b'similarity index'][1])

        file_info.append_data(headers[b'copy from'][1])
        file_info.append_data(headers[b'copy to'][1])

    # Assume by default that the change is empty. If we find content
    # later, we'll clear this.
    empty_change = True

    if b'index' in headers:
        index_range = headers[b'index'][0].split()[0]

        # The header values are byte strings, so the separator must be a
        # byte string too. A text '..' here raises TypeError on Python 3.
        if b'..' in index_range:
            file_info.origInfo, file_info.newInfo = \
                index_range.split(b'..')

        if self.pre_creation_regexp.match(file_info.origInfo):
            file_info.origInfo = PRE_CREATION

        file_info.append_data(headers[b'index'][1])

    # Get the changes
    while linenum < len(self.lines):
        if self._is_git_diff(linenum):
            # The next file's section begins here.
            break
        elif self._is_binary_patch(linenum):
            file_info.binary = True
            file_info.append_data(self.lines[linenum])
            file_info.append_data(b'\n')
            empty_change = False
            linenum += 1
            break
        elif self._is_diff_fromfile_line(linenum):
            orig_line = self.lines[linenum]
            new_line = self.lines[linenum + 1]

            orig_filename = orig_line[len(b'--- '):]
            new_filename = new_line[len(b'+++ '):]

            # Some diffs may incorrectly contain filenames listed as:
            #
            # --- filename\t
            # +++ filename\t
            #
            # We need to strip those single trailing tabs.
            if orig_filename.endswith(b'\t'):
                orig_filename = orig_filename[:-1]

            if new_filename.endswith(b'\t'):
                new_filename = new_filename[:-1]

            # Strip the Git a/ and b/ prefixes, if set in the diff.
            if orig_filename.startswith(b'a/'):
                orig_filename = orig_filename[2:]

            if new_filename.startswith(b'b/'):
                new_filename = new_filename[2:]

            # A /dev/null side has no name of its own, so borrow the
            # name from the other side.
            if orig_filename == b'/dev/null':
                file_info.origInfo = PRE_CREATION
                file_info.origFile = new_filename
            else:
                file_info.origFile = orig_filename

            if new_filename == b'/dev/null':
                file_info.newFile = orig_filename
            else:
                file_info.newFile = new_filename

            file_info.append_data(orig_line)
            file_info.append_data(b'\n')
            file_info.append_data(new_line)
            file_info.append_data(b'\n')

            linenum += 2
        else:
            empty_change = False
            linenum = self.parse_diff_line(linenum, file_info)

    if not file_info.origFile:
        # This file didn't have any --- or +++ lines. This usually means
        # the file was deleted or moved without changes. We'll need to
        # fall back to parsing the diff --git line, which is more
        # error-prone.
        assert not file_info.newFile

        self._parse_diff_git_line(diff_git_line, file_info, linenum)

    # Filenames gathered above are byte strings; hand them back as text.
    if isinstance(file_info.origFile, six.binary_type):
        file_info.origFile = file_info.origFile.decode('utf-8')

    if isinstance(file_info.newFile, six.binary_type):
        file_info.newFile = file_info.newFile.decode('utf-8')

    # For an empty change, we keep the file's info only if it is a new
    # 0-length file, a moved file, a copied file, or a deleted 0-length
    # file.
    if (empty_change and
        file_info.origInfo != PRE_CREATION and
        not (file_info.moved or file_info.copied or file_info.deleted)):
        # We didn't find any interesting content, so leave out this
        # file's info.
        #
        # Note that we may want to change this in the future to preserve
        # data like mode changes, but that will require filtering out
        # empty changes at the diff viewer level in a sane way.
        file_info = None

    return linenum, file_info