def __init__(self, repo: 'Repo', a_rawpath: Optional[bytes], b_rawpath: Optional[bytes],
             a_blob_id: Union[str, bytes, None], b_blob_id: Union[str, bytes, None],
             a_mode: Union[bytes, str, None], b_mode: Union[bytes, str, None],
             new_file: bool, deleted_file: bool, copied_file: bool,
             raw_rename_from: Optional[bytes], raw_rename_to: Optional[bytes],
             diff: Union[str, bytes, None], change_type: Optional[Lit_change_type],
             score: Optional[int]) -> None:
    """Initialize a Diff record from pre-parsed diff fields.

    Raw paths and rename names are bytes (or None), modes are octal mode
    strings as emitted by git, and blob ids are hexshas (or None / the null
    sha when that side of the diff does not exist)."""
    # raw paths must be bytes so later decoding is explicit and controlled
    assert a_rawpath is None or isinstance(a_rawpath, bytes)
    assert b_rawpath is None or isinstance(b_rawpath, bytes)
    self.a_rawpath = a_rawpath
    self.b_rawpath = b_rawpath

    # octal mode strings become ints; missing/empty modes become None
    self.a_mode = mode_str_to_int(a_mode) if a_mode else None
    self.b_mode = mode_str_to_int(b_mode) if b_mode else None

    # Determine whether this diff references a submodule, if it does then
    # we need to overwrite "repo" to the corresponding submodule's repo instead
    if repo and a_rawpath:
        for submodule in repo.submodules:
            if submodule.path == a_rawpath.decode(defenc, 'replace'):
                if submodule.module_exists():
                    repo = submodule.module()
                break

    # a missing or null blob id means that side of the diff has no blob
    self.a_blob: Union['IndexObject', None]
    if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
        self.a_blob = None
    else:
        self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path)

    self.b_blob: Union['IndexObject', None]
    if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
        self.b_blob = None
    else:
        self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path)

    self.new_file: bool = new_file
    self.deleted_file: bool = deleted_file
    self.copied_file: bool = copied_file

    # be clear and use None instead of empty strings
    assert raw_rename_from is None or isinstance(raw_rename_from, bytes)
    assert raw_rename_to is None or isinstance(raw_rename_to, bytes)
    self.raw_rename_from = raw_rename_from or None
    self.raw_rename_to = raw_rename_to or None

    self.diff = diff
    self.change_type: Union[Lit_change_type, None] = change_type
    self.score = score
def partial_to_complete_sha_hex(self, partial_hexsha):
    """Resolve a partial hexsha against all contained databases.

    :return: full binary sha of the single matching object
    :raise AmbiguousObjectName: if two databases resolve to different objects
    :raise BadObject: if no database can resolve the prefix"""
    hexlen = len(partial_hexsha)
    # hex_to_bin needs an even number of nibbles - pad uneven input with a zero
    padded = partial_hexsha + "0" if hexlen % 2 != 0 else partial_hexsha
    partial_binsha = hex_to_bin(padded)
    # END assure successful binary conversion

    candidate = None
    for db in self._dbs:
        try:
            # prefer the hex-based interface when the backend offers it
            if hasattr(db, 'partial_to_complete_sha_hex'):
                resolved = db.partial_to_complete_sha_hex(partial_hexsha)
            else:
                resolved = db.partial_to_complete_sha(partial_binsha, hexlen)
            # END handle database type
        except BadObject:
            continue
        # END ignore bad objects

        if not resolved:
            continue
        if candidate and candidate != resolved:
            raise AmbiguousObjectName(partial_hexsha)
        candidate = resolved
    # END for each db

    if not candidate:
        raise BadObject(partial_binsha)
    return candidate
def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode, b_mode,
             new_file, deleted_file, rename_from, rename_to, diff):
    """Build a Diff record from raw textual fields parsed out of git's diff output."""
    # octal mode strings become ints; missing modes are kept as-is (falsy)
    self.a_mode = mode_str_to_int(a_mode) if a_mode else a_mode
    self.b_mode = mode_str_to_int(b_mode) if b_mode else b_mode

    # a missing blob id means that side of the diff does not exist
    self.a_blob = None if a_blob_id is None else Blob(repo, hex_to_bin(a_blob_id),
                                                      mode=self.a_mode, path=a_path)
    self.b_blob = None if b_blob_id is None else Blob(repo, hex_to_bin(b_blob_id),
                                                      mode=self.b_mode, path=b_path)

    self.new_file = new_file
    self.deleted_file = deleted_file

    # be clear and use None instead of empty strings
    self.rename_from = rename_from or None
    self.rename_to = rename_to or None

    self.diff = diff
def test_reading(self):
    """Read-access smoke test of the pure git object database, including
    partial-sha resolution via the compound interface."""
    gdb = PureGitODB(os.path.join(rorepo_dir(), 'objects'))

    # we have packs and loose objects, alternates doesn't necessarily exist
    assert 1 < len(gdb.databases()) < 4

    # access should be possible
    git_sha = hex_to_bin("5aebcd5cb3340fb31776941d7e4d518a712a8655")
    assert isinstance(gdb.info(git_sha), OInfo)
    assert isinstance(gdb.stream(git_sha), OStream)
    assert gdb.size() > 200

    shas = list(gdb.sha_iter())
    assert len(shas) == gdb.size()

    # This is actually a test for compound functionality, but it doesn't
    # have a separate test module.
    # Test partial shas - this one is uneven and quite short
    assert gdb.partial_to_complete_sha_hex('5aebcd') == git_sha

    # mix even/uneven hexshas
    for index, binsha in enumerate(shas[:50]):
        prefix = bin_to_hex(binsha)[:8 - (index % 2)]
        assert gdb.partial_to_complete_sha_hex(prefix) == binsha
    # END for each sha

    self.failUnlessRaises(BadObject, gdb.partial_to_complete_sha_hex, "0000")
def test_reading(self):
    """Verify object lookup, iteration and partial-sha completion on PureGitODB."""
    db = PureGitODB(os.path.join(rorepo_dir(), 'objects'))

    # we have packs and loose objects, alternates doesn't necessarily exist
    assert 1 < len(db.databases()) < 4

    # access should be possible
    known_sha = hex_to_bin("5aebcd5cb3340fb31776941d7e4d518a712a8655")
    assert isinstance(db.info(known_sha), OInfo)
    assert isinstance(db.stream(known_sha), OStream)
    assert db.size() > 200

    all_shas = list(db.sha_iter())
    assert len(all_shas) == db.size()

    # This is actually a test for compound functionality, but it doesn't
    # have a separate test module.
    # Test partial shas - this one is uneven and quite short
    assert db.partial_to_complete_sha_hex('5aebcd') == known_sha

    # mix even/uneven hexshas
    for num, binsha in enumerate(all_shas[:50]):
        assert db.partial_to_complete_sha_hex(bin_to_hex(binsha)[:8 - (num % 2)]) == binsha
    # END for each sha

    self.failUnlessRaises(BadObject, db.partial_to_complete_sha_hex, "0000")
def partial_to_complete_sha_hex(self, partial_hexsha):
    """:return: Full binary sha matching the given partial hexsha.

    Uneven hexshas are padded with a zero nibble so they can be converted
    to binary; the number of meaningful hex digits is forwarded to
    ``partial_to_complete_sha`` so the padding nibble is not matched."""
    len_partial_hexsha = len(partial_hexsha)
    if len_partial_hexsha % 2 != 0:
        partial_binsha = hex_to_bin(partial_hexsha + "0")
    else:
        partial_binsha = hex_to_bin(partial_hexsha)
    # END assure successful binary conversion
    # reuse the already-computed length instead of calling len() again
    return self.partial_to_complete_sha(partial_binsha, len_partial_hexsha)
def _deserialize(self, stream):
    """:param from_rev_list: if true, the stream format is coming from the rev-list command
    Otherwise it is assumed to be a plain data stream from our object

    NOTE(review): Python 2 only - uses ``print >>`` statements and decodes
    str objects in place."""
    readline = stream.readline
    # first line: "tree <hexsha>"
    self.tree = Tree(self.odb, hex_to_bin(readline().split()[1]), Tree.tree_id << 12, '')

    # zero or more "parent <hexsha>" lines follow
    self.parents = list()
    next_line = None
    while True:
        parent_line = readline()
        if not parent_line.startswith('parent'):
            next_line = parent_line
            break
        # END abort reading parents
        self.parents.append(type(self)(self.odb, hex_to_bin(parent_line.split()[-1])))
    # END for each parent line
    self.parents = tuple(self.parents)

    # next_line holds the author line that terminated the parent loop
    self.author, self.authored_date, self.author_tz_offset = parse_actor_and_date(next_line)
    self.committer, self.committed_date, self.committer_tz_offset = parse_actor_and_date(readline())

    # now we can have the encoding line, or an empty line followed by the optional
    # message.
    self.encoding = self.default_encoding
    # read encoding or empty line to separate message
    enc = readline()
    enc = enc.strip()
    if enc:
        self.encoding = enc[enc.find(' ') + 1:]
        # now comes the message separator
        readline()
    # END handle encoding

    # decode the authors name
    try:
        self.author.name = self.author.name.decode(self.encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, "Failed to decode author name '%s' using encoding %s" % (
            self.author.name, self.encoding)
    # END handle author's encoding

    # decode committer name
    try:
        self.committer.name = self.committer.name.decode(self.encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, "Failed to decode committer name '%s' using encoding %s" % (
            self.committer.name, self.encoding)
    # END handle author's encoding

    # a stream from our data simply gives us the plain message
    # The end of our message stream is marked with a newline that we strip
    self.message = stream.read()
    try:
        self.message = self.message.decode(self.encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, "Failed to decode message '%s' using encoding %s" % (self.message,
                                                                                  self.encoding)
    # END exception handling

    return self
def _deserialize(self, stream):
    """:param from_rev_list: if true, the stream format is coming from the rev-list command
    Otherwise it is assumed to be a plain data stream from our object

    NOTE(review): Python 2 only - uses ``print >>`` statements and decodes
    str objects in place."""
    readline = stream.readline
    # first line: "tree <hexsha>"
    self.tree = Tree(self.odb, hex_to_bin(readline().split()[1]), Tree.tree_id << 12, '')

    # zero or more "parent <hexsha>" lines follow
    self.parents = list()
    next_line = None
    while True:
        parent_line = readline()
        if not parent_line.startswith('parent'):
            next_line = parent_line
            break
        # END abort reading parents
        self.parents.append(type(self)(self.odb, hex_to_bin(parent_line.split()[-1])))
    # END for each parent line
    self.parents = tuple(self.parents)

    # next_line holds the author line that terminated the parent loop
    self.author, self.authored_date, self.author_tz_offset = parse_actor_and_date(next_line)
    self.committer, self.committed_date, self.committer_tz_offset = parse_actor_and_date(readline())

    # now we can have the encoding line, or an empty line followed by the optional
    # message.
    self.encoding = self.default_encoding
    # read encoding or empty line to separate message
    enc = readline()
    enc = enc.strip()
    if enc:
        self.encoding = enc[enc.find(' ') + 1:]
        # now comes the message separator
        readline()
    # END handle encoding

    # decode the authors name
    try:
        self.author.name = self.author.name.decode(self.encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, "Failed to decode author name '%s' using encoding %s" % (
            self.author.name, self.encoding)
    # END handle author's encoding

    # decode committer name
    try:
        self.committer.name = self.committer.name.decode(self.encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, "Failed to decode committer name '%s' using encoding %s" % (
            self.committer.name, self.encoding)
    # END handle author's encoding

    # a stream from our data simply gives us the plain message
    # The end of our message stream is marked with a newline that we strip
    self.message = stream.read()
    try:
        self.message = self.message.decode(self.encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, "Failed to decode message '%s' using encoding %s" % (
            self.message, self.encoding)
    # END exception handling

    return self
def __init__(self, repo, a_rawpath, b_rawpath, a_blob_id, b_blob_id, a_mode, b_mode,
             new_file, deleted_file, copied_file, raw_rename_from, raw_rename_to,
             diff, change_type, score):
    """Initialize a Diff record from pre-parsed diff fields.

    :param repo: repository the diff belongs to; replaced by the submodule's
        repo when the a-path refers to a checked-out submodule
    :param a_rawpath: raw path bytes of the 'a' side, or None
    :param b_rawpath: raw path bytes of the 'b' side, or None
    :param a_blob_id: hexsha of the 'a' blob, or None / null-sha when absent
    :param b_blob_id: hexsha of the 'b' blob, or None / null-sha when absent
    :param a_mode: octal mode string of the 'a' side, or None
    :param b_mode: octal mode string of the 'b' side, or None"""
    self.a_mode = a_mode
    self.b_mode = b_mode

    assert a_rawpath is None or isinstance(a_rawpath, binary_type)
    assert b_rawpath is None or isinstance(b_rawpath, binary_type)
    self.a_rawpath = a_rawpath
    self.b_rawpath = b_rawpath

    if self.a_mode:
        self.a_mode = mode_str_to_int(self.a_mode)
    if self.b_mode:
        self.b_mode = mode_str_to_int(self.b_mode)

    # Determine whether this diff references a submodule, if it does then
    # we need to overwrite "repo" to the corresponding submodule's repo instead
    if repo and a_rawpath:
        for submodule in repo.submodules:
            # BUGFIX: decode with errors='replace' so paths that are not valid
            # UTF-8 cannot raise UnicodeDecodeError here - a mangled path
            # simply fails to match any submodule path
            if submodule.path == a_rawpath.decode("utf-8", "replace"):
                if submodule.module_exists():
                    repo = submodule.module()
                break

    # a missing or null blob id means that side of the diff has no blob
    if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
        self.a_blob = None
    else:
        self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path)

    if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
        self.b_blob = None
    else:
        self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path)

    self.new_file = new_file
    self.deleted_file = deleted_file
    self.copied_file = copied_file

    # be clear and use None instead of empty strings
    assert raw_rename_from is None or isinstance(raw_rename_from, binary_type)
    assert raw_rename_to is None or isinstance(raw_rename_to, binary_type)
    self.raw_rename_from = raw_rename_from or None
    self.raw_rename_to = raw_rename_to or None

    self.diff = diff
    self.change_type = change_type
    self.score = score
def test_decompress_reader_special_case(self):
    """A decompressed stream read must return exactly ``size`` bytes."""
    database = PureLooseObjectODB(fixture_path('objects'))
    obj_stream = database.stream(hex_to_bin('7bb839852ed5e3a069966281bb08d50012fb309b'))

    # if there is a bug, we will be missing one byte exactly !
    payload = obj_stream.read()
    assert len(payload) == obj_stream.size
def _set_cache_(self, attr):
    """Cache all our attributes at once"""
    if attr not in TagObject.__slots__:
        super(TagObject, self)._set_cache_(attr)
        return
    # END delegate unknown attributes to the base class

    ostream = self.odb.stream(self.binsha)
    lines = ostream.read().splitlines()

    # tag object layout: object, type, tag, tagger, blank line, message
    obj, hexsha = lines[0].split(" ")  # object <hexsha>
    type_token, type_name = lines[1].split(" ")  # type <type_name>
    self.object = get_object_type_by_name(type_name)(self.odb, hex_to_bin(hexsha))

    self.tag = lines[2][4:]  # tag <tag name>

    tagger_info = lines[3][7:]  # tagger <actor> <date>
    self.tagger, self.tagged_date, self.tagger_tz_offset = parse_actor_and_date(tagger_info)

    # line 4 empty - it could mark the beginning of the next header
    # in case there really is no message, it would not exist. Otherwise
    # a newline separates header from message
    self.message = "\n".join(lines[5:]) if len(lines) > 5 else ''
    # END check our attributes
def _iter_from_process_or_stream(cls, odb, proc_or_stream):
    """Parse out commit information into a list of Commit objects
    We expect one-line per commit, and parse the actual commit information directly
    from our lighting fast object database

    :param proc: git-rev-list process instance - one sha per line
    :return: iterator returning Commit objects"""
    stream = proc_or_stream
    close_std_err = False
    if not hasattr(stream, 'readline'):
        # a process was handed in - consume its stdout, remember to close stderr
        stream = proc_or_stream.stdout
        close_std_err = True

    read_next = stream.readline
    try:
        while True:
            line = read_next()
            if not line:
                break
            hexsha = line.strip()
            if len(hexsha) > 40:
                # split additional information, as returned by bisect for instance
                hexsha, _ignored = line.split(None, 1)
            # END handle extra info

            assert len(hexsha) == 40, "Invalid line: %s" % hexsha
            yield cls(odb, hex_to_bin(hexsha))
        # END for each line in stream
    finally:
        # always release the stream, and stderr when we own the process
        stream.close()
        if close_std_err:
            proc_or_stream.stderr.close()
def _set_cache_(self, attr):
    """Cache all our attributes at once"""
    if attr in TagObject.__slots__:
        ostream = self.odb.stream(self.binsha)
        lines = ostream.read().splitlines()

        # tag object layout: object, type, tag, tagger, blank line, message
        obj, hexsha = lines[0].split(" ")  # object <hexsha>
        type_token, type_name = lines[1].split(" ")  # type <type_name>
        self.object = get_object_type_by_name(type_name)(self.odb, hex_to_bin(hexsha))

        self.tag = lines[2][4:]  # tag <tag name>

        tagger_info = lines[3][7:]  # tagger <actor> <date>
        self.tagger, self.tagged_date, self.tagger_tz_offset = parse_actor_and_date(tagger_info)

        # line 4 empty - it could mark the beginning of the next header
        # in case there really is no message, it would not exist. Otherwise
        # a newline separates header from message
        if len(lines) > 5:
            self.message = "\n".join(lines[5:])
        else:
            self.message = ''
        # END check our attributes
    else:
        super(TagObject, self)._set_cache_(attr)
def store(self, istream):
    """Write the stream's content into the database and record its binsha on the stream.

    TODO: remove this check once the required functionality was merged in pygit2"""
    if not hasattr(self._py2_repo, 'write'):
        # pygit2 without write support - fall back to the base implementation
        return super(Pygit2GitODB, self).store(istream)
    # END fallback for older pygit2

    written_hexsha = self._py2_repo.write(type_to_type_id_map[istream.type], istream.read())
    istream.binsha = hex_to_bin(written_hexsha)
    return istream
def _iter_from_process_or_stream(cls, repo, proc_or_stream):
    """Parse out commit information into a list of Commit objects
    We expect one-line per commit, and parse the actual commit information directly
    from our lighting fast object database

    :param proc: git-rev-list process instance - one sha per line
    :return: iterator returning Commit objects"""
    stream = proc_or_stream
    if not hasattr(stream, 'readline'):
        # a process was handed in - consume its stdout
        stream = proc_or_stream.stdout

    readline = stream.readline
    while True:
        line = readline()
        if not line:
            break
        hexsha = line.strip()
        if len(hexsha) > 40:
            # split additional information, as returned by bisect for instance
            hexsha, _ = line.split(None, 1)
        # END handle extra info

        assert len(hexsha) == 40, "Invalid line: %s" % hexsha
        yield Commit(repo, hex_to_bin(hexsha))
    # END for each line in stream

    # TODO: Review this - it seems process handling got a bit out of control
    # due to many developers trying to fix the open file handles issue
    # NOTE(review): this is a generator, so the finalization below only runs
    # when the caller fully exhausts the iterator - confirm that is intended
    if hasattr(proc_or_stream, 'wait'):
        finalize_process(proc_or_stream)
def test_index_merge_tree(self, rw_repo):
    """Exercise IndexFile.merge_tree: a single-tree merge, a faked conflicting
    entry with a null sha, and a proper three-way merge yielding unmerged blobs."""
    # A bit out of place, but we need a different repo for this:
    self.assertNotEqual(self.rorepo, rw_repo)
    self.assertEqual(len(set((self.rorepo, self.rorepo, rw_repo, rw_repo))), 2)

    # SINGLE TREE MERGE
    # current index is at the (virtual) cur_commit
    next_commit = "4c39f9da792792d4e73fc3a5effde66576ae128c"
    parent_commit = rw_repo.head.commit.parents[0]
    manifest_key = IndexFile.entry_key('MANIFEST.in', 0)
    manifest_entry = rw_repo.index.entries[manifest_key]
    rw_repo.index.merge_tree(next_commit)
    # only one change should be recorded
    assert manifest_entry.binsha != rw_repo.index.entries[manifest_key].binsha

    rw_repo.index.reset(rw_repo.head)
    self.assertEqual(rw_repo.index.entries[manifest_key].binsha, manifest_entry.binsha)

    # FAKE MERGE
    #############
    # Add a change with a NULL sha that should conflict with next_commit. We
    # pretend there was a change, but we do not even bother adding a proper
    # sha for it ( which makes things faster of course )
    manifest_fake_entry = BaseIndexEntry((manifest_entry[0], b"\0" * 20, 0, manifest_entry[3]))
    # try write flag
    self._assert_entries(rw_repo.index.add([manifest_fake_entry], write=False))
    # add actually resolves the null-hex-sha for us as a feature, but we can
    # edit the index manually
    assert rw_repo.index.entries[manifest_key].binsha != Object.NULL_BIN_SHA
    # must operate on the same index for this ! Its a bit problematic as
    # it might confuse people
    index = rw_repo.index
    index.entries[manifest_key] = IndexEntry.from_base(manifest_fake_entry)
    index.write()
    self.assertEqual(rw_repo.index.entries[manifest_key].hexsha, Diff.NULL_HEX_SHA)

    # write an unchanged index ( just for the fun of it )
    rw_repo.index.write()

    # a three way merge would result in a conflict and fails as the command will
    # not overwrite any entries in our index and hence leave them unmerged. This is
    # mainly a protection feature as the current index is not yet in a tree
    self.failUnlessRaises(GitCommandError, index.merge_tree, next_commit, base=parent_commit)

    # the only way to get the merged entries is to save the current index away into a tree,
    # which is like a temporary commit for us. This fails as well as the NULL sha does not
    # have a corresponding object
    # NOTE: missing_ok is not a kwarg anymore, missing_ok is always true
    # self.failUnlessRaises(GitCommandError, index.write_tree)

    # if missing objects are okay, this would work though ( they are always okay now )
    # As we can't read back the tree with NULL_SHA, we rather set it to something else
    index.entries[manifest_key] = IndexEntry(manifest_entry[:1] + (hex_to_bin('f' * 40),) + manifest_entry[2:])
    tree = index.write_tree()

    # now make a proper three way merge with unmerged entries
    unmerged_tree = IndexFile.from_tree(rw_repo, parent_commit, tree, next_commit)
    unmerged_blobs = unmerged_tree.unmerged_blobs()
    self.assertEqual(len(unmerged_blobs), 1)
    self.assertEqual(list(unmerged_blobs.keys())[0], manifest_key[0])
def test_writing(self, path):
    """update_cache() must pick up valid alternate paths and drop invalid ones."""
    NULL_BIN_SHA = '\0' * 20

    alternates_file = os.path.join(path, 'alternates')
    ref_db = PureReferenceDB(alternates_file)

    # try empty, non-existing alternates: no databases, no objects
    assert len(ref_db.databases()) == 0
    assert ref_db.size() == 0
    assert len(list(ref_db.sha_iter())) == 0
    assert not ref_db.has_object(NULL_BIN_SHA)

    # setup alternate file
    # add two, one is invalid
    own_repo_path = fixture_path('../../../.git/objects')  # use own repo
    self.make_alt_file(alternates_file, [own_repo_path, "invalid/path"])
    ref_db.update_cache()
    assert len(ref_db.databases()) == 1

    # we should now find a default revision of ours
    known_sha = hex_to_bin("5aebcd5cb3340fb31776941d7e4d518a712a8655")
    assert ref_db.has_object(known_sha)

    # remove the valid entry - the database list shrinks accordingly
    self.make_alt_file(alternates_file, ["just/one/invalid/path"])
    ref_db.update_cache()
    assert len(ref_db.databases()) == 0

    # add the valid entry back
    self.make_alt_file(alternates_file, [own_repo_path])
    ref_db.update_cache()
    assert len(ref_db.databases()) == 1
def _get_object(self):
    """
    :return: The object our ref currently refers to. Refs can be cached, they will
        always point to the actual object as it gets re-created on each query"""
    # have to be dynamic here as we may be a tag which can point to anything
    # Our path will be resolved to the hexsha which will be used accordingly
    hexsha = self.dereference_recursive(self.repo, self.path)
    return Object.new_from_sha(self.repo, hex_to_bin(hexsha))
def test_decompress_reader_special_case(self):
    """Regression: the decompressing reader must not drop the final byte."""
    loose_db = PureLooseObjectODB(fixture_path('objects'))
    stream = loose_db.stream(hex_to_bin('7bb839852ed5e3a069966281bb08d50012fb309b'))

    # if there is a bug, we will be missing one byte exactly !
    content = stream.read()
    assert len(content) == stream.size
def __init__(self, repo, a_rawpath, b_rawpath, a_blob_id, b_blob_id, a_mode, b_mode,
             new_file, deleted_file, raw_rename_from, raw_rename_to, diff,
             change_type, score):
    """Initialize a Diff record; raw paths and rename names are bytes,
    modes are octal mode strings, blob ids are hexshas (or None / null sha)."""
    # raw paths must be bytes or None
    assert a_rawpath is None or isinstance(a_rawpath, binary_type)
    assert b_rawpath is None or isinstance(b_rawpath, binary_type)
    self.a_rawpath = a_rawpath
    self.b_rawpath = b_rawpath

    # octal mode strings become ints; missing modes are kept as-is (falsy)
    self.a_mode = mode_str_to_int(a_mode) if a_mode else a_mode
    self.b_mode = mode_str_to_int(b_mode) if b_mode else b_mode

    # a missing or null blob id means that side of the diff has no blob
    if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
        self.a_blob = None
    else:
        self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path)

    if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
        self.b_blob = None
    else:
        self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path)

    self.new_file = new_file
    self.deleted_file = deleted_file

    # be clear and use None instead of empty strings
    assert raw_rename_from is None or isinstance(raw_rename_from, binary_type)
    assert raw_rename_to is None or isinstance(raw_rename_to, binary_type)
    self.raw_rename_from = raw_rename_from or None
    self.raw_rename_to = raw_rename_to or None

    self.diff = diff
    self.change_type = change_type
    self.score = score
def stream(self, sha):
    """For now, all lookup is done by git itself

    :note: As we don't know when the stream is actually read (and if it is stored for
        later use) we read the data right away and cache it. This has HUGE performance
        implication, both for memory as for reading/deserializing objects, but we have
        no other choice in order to make the database behaviour consistent with other
        implementations !"""
    full_hexsha, typename, size, data = self._git.get_object_data(bin_to_hex(sha))
    # buffer the whole payload in memory so the stream can be re-read safely
    return OStream(hex_to_bin(full_hexsha), typename, size, StringIO(data))
def partial_to_complete_sha_hex(self, partial_hexsha):
    """:return: Full binary 20 byte sha from the given partial hexsha
    :raise AmbiguousObjectName:
    :raise BadObject:
    :note: currently we only raise BadObject as git does not communicate
        AmbiguousObjects separately"""
    try:
        # let git resolve the prefix; the header's first field is the full hexsha
        full_hexsha, typename, size = self._git.get_object_header(partial_hexsha)
        return hex_to_bin(full_hexsha)
    except (GitCommandError, ValueError):
        # translate any git failure into the database's BadObject contract
        raise BadObject(partial_hexsha)
def partial_to_complete_sha_hex(self, partial_hexsha: str) -> bytes:
    """:return: Full binary 20 byte sha from the given partial hexsha
    :raise AmbiguousObjectName:
    :raise BadObject:
    :note: currently we only raise BadObject as git does not communicate
        AmbiguousObjects separately"""
    try:
        # git resolves the prefix; the header's first field is the full hexsha
        full_hexsha, _, _ = self._git.get_object_header(partial_hexsha)
        return hex_to_bin(full_hexsha)
    except (GitCommandError, ValueError) as e:
        # translate any git failure into the database's BadObject contract
        raise BadObject(partial_hexsha) from e
def sha_iter(self):
    """Iterate the binary shas of all loose objects below our root path.

    Loose objects live at <root>/<2-hex-char-prefix>/<38-hex-char-suffix>."""
    # find all files which look like an object, extract sha from there
    for directory, _dirs, filenames in os.walk(self.root_path()):
        prefix = basename(directory)
        # only fan-out directories named by the first sha byte qualify
        if len(prefix) != 2:
            continue
        for filename in filenames:
            if len(filename) != 38:
                continue
            yield hex_to_bin(prefix + filename)
def name_to_object(
        repo: 'Repo', name: str, return_ref: bool = False
) -> Union[SymbolicReference, 'Commit', 'TagObject', 'Blob', 'Tree']:
    """
    :return: object specified by the given name, hexshas ( short and long )
        as well as references are supported
    :param return_ref: if name specifies a reference, we will return the reference
        instead of the object. Otherwise it will raise BadObject or BadName
    """
    hexsha: Union[None, str, bytes] = None

    # is it a hexsha ? Try the most common ones, which is 7 to 40
    if repo.re_hexsha_shortened.match(name):
        if len(name) != 40:
            # find long sha for short sha
            hexsha = short_to_long(repo.odb, name)
        else:
            hexsha = name
        # END handle short shas
    # END find sha if it matches

    # if we couldn't find an object for what seemed to be a short hexsha
    # try to find it as reference anyway, it could be named 'aaa' for instance
    if hexsha is None:
        for base in ('%s', 'refs/%s', 'refs/tags/%s', 'refs/heads/%s',
                     'refs/remotes/%s', 'refs/remotes/%s/HEAD'):
            try:
                hexsha = SymbolicReference.dereference_recursive(repo, base % name)
                if return_ref:
                    return SymbolicReference(repo, base % name)
                # END handle symbolic ref
                break
            except ValueError:
                pass
        # END for each base
    # END handle hexsha

    # didn't find any ref, this is an error
    if return_ref:
        raise BadObject("Couldn't find reference named %r" % name)
    # END handle return ref

    # tried everything ? fail
    if hexsha is None:
        raise BadName(name)
    # END assert hexsha was found

    return Object.new_from_sha(repo, hex_to_bin(hexsha))
def __init__(self, repo, a_rawpath, b_rawpath, a_blob_id, b_blob_id, a_mode, b_mode,
             new_file, deleted_file, raw_rename_from, raw_rename_to, diff,
             change_type):
    """Initialize a Diff record; raw paths and rename names are bytes,
    modes are octal mode strings, blob ids are hexshas (or None / null sha)."""
    # raw paths must be bytes or None
    assert a_rawpath is None or isinstance(a_rawpath, binary_type)
    assert b_rawpath is None or isinstance(b_rawpath, binary_type)
    self.a_rawpath = a_rawpath
    self.b_rawpath = b_rawpath

    # octal mode strings become ints; missing modes are kept as-is (falsy)
    self.a_mode = mode_str_to_int(a_mode) if a_mode else a_mode
    self.b_mode = mode_str_to_int(b_mode) if b_mode else b_mode

    # a missing or null blob id means that side of the diff has no blob
    if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
        self.a_blob = None
    else:
        self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path)

    if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
        self.b_blob = None
    else:
        self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path)

    self.new_file = new_file
    self.deleted_file = deleted_file

    # be clear and use None instead of empty strings
    assert raw_rename_from is None or isinstance(raw_rename_from, binary_type)
    assert raw_rename_to is None or isinstance(raw_rename_to, binary_type)
    self.raw_rename_from = raw_rename_from or None
    self.raw_rename_to = raw_rename_to or None

    self.diff = diff
    self.change_type = change_type
def test_base_object(self):
    """Exercise the shared interface of all base object types."""
    # test interface of base object classes
    types = (Blob, Tree, Commit, TagObject)
    self.assertEqual(len(types), len(self.type_tuples))

    s = set()
    num_objs = 0
    num_index_objs = 0
    for obj_type, (typename, hexsha, path) in zip(types, self.type_tuples):
        binsha = hex_to_bin(hexsha)
        item = None
        if path is None:
            item = obj_type(self.rorepo, binsha)
        else:
            item = obj_type(self.rorepo, binsha, 0, path)
        # END handle index objects
        num_objs += 1
        self.assertEqual(item.hexsha, hexsha)
        self.assertEqual(item.type, typename)
        assert item.size
        self.assertEqual(item, item)
        self.assertNotEqual(not item, item)
        self.assertEqual(str(item), item.hexsha)
        assert repr(item)
        s.add(item)

        if isinstance(item, base.IndexObject):
            num_index_objs += 1
            if hasattr(item, 'path'):  # never runs here
                assert not item.path.startswith("/")  # must be relative
                assert isinstance(item.mode, int)
        # END index object check

        # read from stream
        data_stream = item.data_stream
        data = data_stream.read()
        assert data

        # stream_data must write the identical bytes to a file
        tmpfilename = tempfile.mktemp(suffix='test-stream')
        with open(tmpfilename, 'wb+') as tmpfile:
            self.assertEqual(item, item.stream_data(tmpfile))
            tmpfile.seek(0)
            self.assertEqual(tmpfile.read(), data)
        os.remove(tmpfilename)
    # END for each object type to create

    # each has a unique sha
    self.assertEqual(len(s), num_objs)
    self.assertEqual(len(s | s), num_objs)
    self.assertEqual(num_index_objs, 2)
def test_base_object(self):
    """Exercise the shared interface of all base object types.

    NOTE(review): Python 2 only - ``os.tmpfile()`` was removed in Python 3."""
    # test interface of base object classes
    types = (Blob, Tree, Commit, TagObject)
    assert len(types) == len(self.type_tuples)

    s = set()
    num_objs = 0
    num_index_objs = 0
    for obj_type, (typename, hexsha, path) in zip(types, self.type_tuples):
        binsha = hex_to_bin(hexsha)
        item = None
        if path is None:
            item = obj_type(self.rorepo, binsha)
        else:
            item = obj_type(self.rorepo, binsha, 0, path)
        # END handle index objects
        num_objs += 1
        assert item.hexsha == hexsha
        assert item.type == typename
        assert item.size
        assert item == item
        assert not item != item
        assert str(item) == item.hexsha
        assert repr(item)
        s.add(item)

        if isinstance(item, base.IndexObject):
            num_index_objs += 1
            if hasattr(item, 'path'):  # never runs here
                assert not item.path.startswith("/")  # must be relative
                assert isinstance(item.mode, int)
        # END index object check

        # read from stream
        data_stream = item.data_stream
        data = data_stream.read()
        assert data

        # stream_data must write the identical bytes to a file
        tmpfile = os.tmpfile()
        assert item == item.stream_data(tmpfile)
        tmpfile.seek(0)
        assert tmpfile.read() == data
        # END stream to file directly
    # END for each object type to create

    # each has a unique sha
    assert len(s) == num_objs
    assert len(s | s) == num_objs
    assert num_index_objs == 2
def extract_options(repo, blob_id):
    """Collect option names from the given blob: every line matching
    ``re_option`` announces an option on the following line, of which the
    text before the first ';' is recorded."""
    from git.util import hex_to_bin

    stream = git.Blob(repo, hex_to_bin(blob_id)).data_stream.stream
    found = set()
    for raw_line in iter(stream.readline, b''):
        text = raw_line.decode('utf-8').strip()
        if not re_option.match(text):
            continue
        # the option name is on the line after the marker line
        option_line = stream.readline().decode('utf-8').strip()
        found.add(option_line.split(';')[0])
    return found
def name_to_object(repo, name, return_ref=False):
    """
    :return: object specified by the given name, hexshas ( short and long )
        as well as references are supported
    :param return_ref: if name specifies a reference, we will return the reference
        instead of the object. Otherwise it will raise BadObject
    """
    hexsha = None

    # is it a hexsha ? Try the most common ones, which is 7 to 40
    if repo.re_hexsha_shortened.match(name):
        if len(name) != 40:
            # find long sha for short sha
            hexsha = short_to_long(repo.odb, name)
        else:
            hexsha = name
        # END handle short shas
    # END find sha if it matches

    # if we couldn't find an object for what seemed to be a short hexsha
    # try to find it as reference anyway, it could be named 'aaa' for instance
    if hexsha is None:
        for base in ('%s', 'refs/%s', 'refs/tags/%s', 'refs/heads/%s',
                     'refs/remotes/%s', 'refs/remotes/%s/HEAD'):
            try:
                hexsha = SymbolicReference.dereference_recursive(repo, base % name)
                if return_ref:
                    return SymbolicReference(repo, base % name)
                # END handle symbolic ref
                break
            except ValueError:
                pass
        # END for each base
    # END handle hexsha

    # didn't find any ref, this is an error
    if return_ref:
        raise BadObject("Couldn't find reference named %r" % name)
    # END handle return ref

    # tried everything ? fail
    if hexsha is None:
        raise BadObject(name)
    # END assert hexsha was found

    return Object.new_from_sha(repo, hex_to_bin(hexsha))
def _deserialize(self, stream):
    """:param from_rev_list: if true, the stream format is coming from the rev-list command
    Otherwise it is assumed to be a plain data stream from our object"""
    readline = stream.readline
    # first line: "tree <hexsha>"
    self.tree = Tree(self.repo, hex_to_bin(readline().split()[1]), Tree.tree_id << 12, '')

    # zero or more "parent <hexsha>" lines follow
    self.parents = []
    next_line = None
    while True:
        parent_line = readline()
        if not parent_line.startswith(b'parent'):
            next_line = parent_line
            break
        # END abort reading parents
        self.parents.append(
            type(self)(self.repo, hex_to_bin(
                parent_line.split()[-1].decode('ascii'))))
    # END for each parent line
    self.parents = tuple(self.parents)

    # we don't know actual author encoding before we have parsed it, so keep the lines around
    author_line = next_line
    committer_line = readline()

    # we might run into one or more mergetag blocks, skip those for now
    next_line = readline()
    while next_line.startswith(b'mergetag '):
        next_line = readline()
        while next_line.startswith(b' '):
            next_line = readline()
    # end skip mergetags

    # now we can have the encoding line, or an empty line followed by the optional
    # message.
    self.encoding = self.default_encoding
    self.gpgsig = None

    # read headers
    enc = next_line
    buf = enc.strip()
    while buf:
        # BUGFIX: b"encoding " is 9 bytes, so comparing a 10-byte slice
        # against it could never match and the encoding header was
        # silently ignored - compare exactly 9 bytes
        if buf[0:9] == b"encoding ":
            # BUGFIX: bytes.find() requires a bytes argument; passing ' '
            # (str) raises TypeError on Python 3
            self.encoding = buf[buf.find(b' ') + 1:].decode(
                self.encoding, 'ignore')
        elif buf[0:7] == b"gpgsig ":
            # the signature continues on lines starting with a single space
            sig = buf[buf.find(b' ') + 1:] + b"\n"
            is_next_header = False
            while True:
                sigbuf = readline()
                if not sigbuf:
                    break
                if sigbuf[0:1] != b" ":
                    buf = sigbuf.strip()
                    is_next_header = True
                    break
                sig += sigbuf[1:]
            # end read all signature
            self.gpgsig = sig.rstrip(b"\n").decode(self.encoding, 'ignore')
            if is_next_header:
                continue
        buf = readline().strip()

    # decode the authors name
    try:
        self.author, self.authored_date, self.author_tz_offset = \
            parse_actor_and_date(author_line.decode(self.encoding, 'replace'))
    except UnicodeDecodeError:
        log.error("Failed to decode author line '%s' using encoding %s", author_line,
                  self.encoding, exc_info=True)

    try:
        self.committer, self.committed_date, self.committer_tz_offset = \
            parse_actor_and_date(committer_line.decode(self.encoding, 'replace'))
    except UnicodeDecodeError:
        log.error("Failed to decode committer line '%s' using encoding %s", committer_line,
                  self.encoding, exc_info=True)
    # END handle author's encoding

    # a stream from our data simply gives us the plain message
    # The end of our message stream is marked with a newline that we strip
    self.message = stream.read()
    try:
        self.message = self.message.decode(self.encoding, 'replace')
    except UnicodeDecodeError:
        log.error("Failed to decode message '%s' using encoding %s",
                  self.message, self.encoding, exc_info=True)
    # END exception handling

    return self
def blame(self, rev, file, incremental=False, **kwargs):
    """The blame information for the given file at the given revision.

    :parm rev: revision specifier, see git-rev-parse for viable options.
    :return:
        list: [git.Commit, list: [<line>]]
        A list of tuples associating a Commit object with a list of lines that
        changed within the given commit. The Commit objects will be given in order
        of appearance."""
    if incremental:
        return self.blame_incremental(rev, file, **kwargs)
    # -p requests the stable porcelain output format
    data = self.git.blame(rev, '--', file, p=True, stdout_as_string=False, **kwargs)
    commits = dict()
    blames = list()
    info = None

    keepends = True
    for line in data.splitlines(keepends):
        try:
            line = line.rstrip().decode(defenc)
        except UnicodeDecodeError:
            firstpart = ''
            is_binary = True
        else:
            # As we don't have an idea when the binary data ends, as it could contain multiple newlines
            # in the process. So we rely on being able to decode to tell us what is is.
            # This can absolutely fail even on text files, but even if it does, we should be fine treating it
            # as binary instead
            parts = self.re_whitespace.split(line, 1)
            firstpart = parts[0]
            is_binary = False
        # end handle decode of line
        # NOTE(review): in the binary branch above, `parts` keeps its value from a
        # previous iteration (or is unbound on the very first line) — verify callers
        # never feed a stream whose first line fails to decode.

        if self.re_hexsha_only.search(firstpart):
            # handles
            # 634396b2f541a9f2d58b00be1a07f0c358b999b3 1 1 7        - indicates blame-data start
            # 634396b2f541a9f2d58b00be1a07f0c358b999b3 2 2          - indicates
            # another line of blame with the same data
            digits = parts[-1].split(" ")
            if len(digits) == 3:
                info = {'id': firstpart}
                blames.append([None, []])
            elif info['id'] != firstpart:
                info = {'id': firstpart}
                blames.append([commits.get(firstpart), []])
            # END blame data initialization
        else:
            m = self.re_author_committer_start.search(firstpart)
            if m:
                # handles:
                # author Tom Preston-Werner
                # author-mail <*****@*****.**>
                # author-time 1192271832
                # author-tz -0700
                # committer Tom Preston-Werner
                # committer-mail <*****@*****.**>
                # committer-time 1192271832
                # committer-tz -0700  - IGNORED BY US
                role = m.group(0)
                if firstpart.endswith('-mail'):
                    info["%s_email" % role] = parts[-1]
                elif firstpart.endswith('-time'):
                    info["%s_date" % role] = int(parts[-1])
                elif role == firstpart:
                    info[role] = parts[-1]
                # END distinguish mail,time,name
            else:
                # handle
                # filename lib/grit.rb
                # summary add Blob
                # <and rest>
                if firstpart.startswith('filename'):
                    info['filename'] = parts[-1]
                elif firstpart.startswith('summary'):
                    info['summary'] = parts[-1]
                elif firstpart == '':
                    if info:
                        sha = info['id']
                        c = commits.get(sha)
                        if c is None:
                            # commit objects are created lazily, once per unique sha
                            c = Commit(
                                self, hex_to_bin(sha),
                                author=Actor._from_string(
                                    info['author'] + ' ' + info['author_email']),
                                authored_date=info['author_date'],
                                committer=Actor._from_string(
                                    info['committer'] + ' ' + info['committer_email']),
                                committed_date=info['committer_date'])
                            commits[sha] = c
                        # END if commit objects needs initial creation
                        if not is_binary:
                            # content lines are tab-prefixed in porcelain output
                            if line and line[0] == '\t':
                                line = line[1:]
                        else:
                            # NOTE: We are actually parsing lines out of binary data, which can lead to the
                            # binary being split up along the newline separator. We will append this to the blame
                            # we are currently looking at, even though it should be concatenated with the last line
                            # we have seen.
                            pass
                        # end handle line contents
                        blames[-1][0] = c
                        blames[-1][1].append(line)
                        info = {'id': sha}
                    # END if we collected commit info
                # END distinguish filename,summary,rest
            # END distinguish author|committer vs filename,summary,rest
        # END distinguish hexsha vs other information
    return blames
def blame_incremental(self, rev, file, **kwargs):
    """Iterator for blame information for the given file at the given revision.

    Unlike .blame(), this does not return the actual file's contents, only
    a stream of BlameEntry tuples.

    :parm rev: revision specifier, see git-rev-parse for viable options.
    :return: lazy iterator of BlameEntry tuples, where the commit
             indicates the commit to blame for the line, and range
             indicates a span of line numbers in the resulting file.

    If you combine all line number ranges outputted by this command, you
    should get a continuous range spanning all line numbers in the file.
    """
    data = self.git.blame(rev, '--', file, p=True, incremental=True,
                          stdout_as_string=False, **kwargs)
    commits = dict()

    stream = (line for line in data.split(b'\n') if line)
    while True:
        line = next(stream)  # when exhausted, causes a StopIteration, terminating this function
        # each entry starts with: <hexsha> <orig_lineno> <lineno> <num_lines>
        hexsha, orig_lineno, lineno, num_lines = line.split()
        lineno = int(lineno)
        num_lines = int(num_lines)
        orig_lineno = int(orig_lineno)
        if hexsha not in commits:
            # Now read the next few lines and build up a dict of properties
            # for this commit
            props = dict()
            while True:
                line = next(stream)
                if line == b'boundary':
                    # "boundary" indicates a root commit and occurs
                    # instead of the "previous" tag
                    continue
                tag, value = line.split(b' ', 1)
                props[tag] = value
                if tag == b'filename':
                    # "filename" formally terminates the entry for --incremental
                    orig_filename = value
                    break
            c = Commit(
                self, hex_to_bin(hexsha),
                author=Actor(
                    safe_decode(props[b'author']),
                    safe_decode(props[b'author-mail'].lstrip(b'<').rstrip(b'>'))),
                authored_date=int(props[b'author-time']),
                committer=Actor(
                    safe_decode(props[b'committer']),
                    safe_decode(props[b'committer-mail'].lstrip(b'<').rstrip(b'>'))),
                committed_date=int(props[b'committer-time']))
            commits[hexsha] = c
        else:
            # Discard all lines until we find "filename" which is
            # guaranteed to be the last line
            while True:
                line = next(stream)  # will fail if we reach the EOF unexpectedly
                tag, value = line.split(b' ', 1)
                if tag == b'filename':
                    orig_filename = value
                    break

        yield BlameEntry(commits[hexsha],
                         range(lineno, lineno + num_lines),
                         safe_decode(orig_filename),
                         range(orig_lineno, orig_lineno + num_lines))
def test_base(self):
    """Round-trips RefLog serialization, appends entries and checks indexed access."""
    rlp_head = fixture_path('reflog_HEAD')
    rlp_master = fixture_path('reflog_master')
    # mkdtemp creates the directory atomically; mktemp() + os.mkdir() was
    # racy and tempfile.mktemp is deprecated
    tdir = tempfile.mkdtemp(suffix="test_reflogs")

    rlp_master_ro = RefLog.path(self.rorepo.head)
    assert osp.isfile(rlp_master_ro)

    # simple read
    reflog = RefLog.from_file(rlp_master_ro)
    assert reflog._path is not None
    assert isinstance(reflog, RefLog)
    assert len(reflog)

    # iter_entries works with path and with stream; close the stream we open
    with open(rlp_master, 'rb') as fp:
        assert len(list(RefLog.iter_entries(fp)))
    assert len(list(RefLog.iter_entries(rlp_master)))

    # raise on invalid revlog
    # TODO: Try multiple corrupted ones !
    pp = 'reflog_invalid_'
    for suffix in ('oldsha', 'newsha', 'email', 'date', 'sep'):
        self.assertRaises(ValueError, RefLog.from_file, fixture_path(pp + suffix))
    # END for each invalid file

    # cannot write an uninitialized reflog
    self.assertRaises(ValueError, RefLog().write)

    # test serialize and deserialize - results must match exactly
    binsha = hex_to_bin(('f' * 40).encode('ascii'))
    msg = "my reflog message"
    cr = self.rorepo.config_reader()
    for rlp in (rlp_head, rlp_master):
        reflog = RefLog.from_file(rlp)
        tfile = osp.join(tdir, osp.basename(rlp))
        reflog.to_file(tfile)
        assert reflog.write() is reflog

        # parsed result must match ...
        treflog = RefLog.from_file(tfile)
        assert treflog == reflog

        # ... as well as each bytes of the written stream
        # (context managers avoid leaking the file handles)
        with open(tfile) as fp_new, open(rlp) as fp_orig:
            assert fp_new.read() == fp_orig.read()

        # append an entry
        entry = RefLog.append_entry(cr, tfile, IndexObject.NULL_BIN_SHA, binsha, msg)
        assert entry.oldhexsha == IndexObject.NULL_HEX_SHA
        assert entry.newhexsha == 'f' * 40
        assert entry.message == msg
        assert RefLog.from_file(tfile)[-1] == entry

        # index entry
        # raises on invalid index
        self.assertRaises(IndexError, RefLog.entry_at, rlp, 10000)

        # indices can be positive ...
        assert isinstance(RefLog.entry_at(rlp, 0), RefLogEntry)
        RefLog.entry_at(rlp, 23)

        # ... and negative
        for idx in (-1, -24):
            RefLog.entry_at(rlp, idx)
        # END for each index to read
    # END for each reflog

    # finally remove our temporary data
    rmtree(tdir)
def _get_object_sha(self): """ :return: The binary sha to the object our ref currently refers to. Refs can be cached, they will always point to the actual object as it gets re-created on each query""" return hex_to_bin(self.dereference_recursive(self.repo, self.path))
def rev_parse(repo, rev):
    """
    :return: Object at the given revision, either Commit, Tag, Tree or Blob
    :param rev: git-rev-parse compatible revision specification as string, please see
        http://www.kernel.org/pub/software/scm/git/docs/git-rev-parse.html
        for details
    :raise BadObject: if the given revision could not be found
    :raise ValueError: If rev couldn't be parsed
    :raise IndexError: If invalid reflog index is specified"""
    # colon search mode ?
    if rev.startswith(':/'):
        # colon search mode
        raise NotImplementedError("commit by message search ( regex )")
    # END handle search

    obj = None
    ref = None
    output_type = "commit"
    start = 0
    parsed_to = 0
    lr = len(rev)
    # scan for the next operator token; everything before it names a ref/object
    while start < lr:
        if rev[start] not in "^~:@":
            start += 1
            continue
        # END handle start

        token = rev[start]

        if obj is None:
            # token is a rev name
            if start == 0:
                ref = repo.head.ref
            else:
                if token == '@':
                    ref = name_to_object(repo, rev[:start], return_ref=True)
                else:
                    obj = name_to_object(repo, rev[:start])
                # END handle token
            # END handle refname

            if ref is not None:
                obj = ref.commit
            # END handle ref
        # END initialize obj on first token

        start += 1

        # try to parse {type}
        if start < lr and rev[start] == '{':
            end = rev.find('}', start)
            if end == -1:
                raise ValueError("Missing closing brace to define type in %s" % rev)
            output_type = rev[start + 1:end]  # exclude brace

            # handle type
            if output_type == 'commit':
                pass  # default
            elif output_type == 'tree':
                try:
                    obj = to_commit(obj).tree
                except (AttributeError, ValueError):
                    pass  # error raised later
                # END exception handling
            elif output_type in ('', 'blob'):
                if obj.type == 'tag':
                    obj = deref_tag(obj)
                else:
                    # cannot do anything for non-tags
                    pass
                # END handle tag
            elif token == '@':
                # try single int
                assert ref is not None, "Requre Reference to access reflog"
                revlog_index = None
                try:
                    # transform reversed index into the format of our revlog
                    revlog_index = -(int(output_type) + 1)
                except ValueError:
                    # TODO: Try to parse the other date options, using parse_date
                    # maybe
                    raise NotImplementedError("Support for additional @{...} modes not implemented")
                # END handle revlog index

                try:
                    entry = ref.log_entry(revlog_index)
                except IndexError:
                    raise IndexError("Invalid revlog index: %i" % revlog_index)
                # END handle index out of bound

                obj = Object.new_from_sha(repo, hex_to_bin(entry.newhexsha))

                # make it pass the following checks
                output_type = None
            else:
                raise ValueError("Invalid output type: %s ( in %s )" % (output_type, rev))
            # END handle output type

            # empty output types don't require any specific type, its just about dereferencing tags
            if output_type and obj.type != output_type:
                raise ValueError("Could not accommodate requested object type %r, got %s"
                                 % (output_type, obj.type))
            # END verify output type

            start = end + 1  # skip brace
            parsed_to = start
            continue
        # END parse type

        # try to parse a number
        num = 0
        if token != ":":
            found_digit = False
            while start < lr:
                if rev[start] in digits:
                    num = num * 10 + int(rev[start])
                    start += 1
                    found_digit = True
                else:
                    break
                # END handle number
            # END number parse loop

            # no explicit number given, 1 is the default
            # It could be 0 though
            if not found_digit:
                num = 1
            # END set default num
        # END number parsing only if non-blob mode

        parsed_to = start
        # handle hierarchy walk
        try:
            if token == "~":
                obj = to_commit(obj)
                for _ in xrange(num):
                    obj = obj.parents[0]
                # END for each history item to walk
            elif token == "^":
                obj = to_commit(obj)
                # must be n'th parent
                if num:
                    obj = obj.parents[num - 1]
            elif token == ":":
                if obj.type != "tree":
                    obj = obj.tree
                # END get tree type
                obj = obj[rev[start:]]
                parsed_to = lr
            else:
                raise ValueError("Invalid token: %r" % token)
            # END end handle tag
        except (IndexError, AttributeError):
            raise BadName(
                "Invalid revision spec '%s' - not enough parent commits to reach '%s%i'"
                % (rev, token, num))
        # END exception handling
    # END parse loop

    # still no obj ? Its probably a simple name
    if obj is None:
        obj = name_to_object(repo, rev)
        parsed_to = lr
    # END handle simple name

    if obj is None:
        raise ValueError("Revision specifier could not be parsed: %s" % rev)

    if parsed_to != lr:
        raise ValueError("Didn't consume complete rev spec %s, consumed part: %s"
                         % (rev, rev[:parsed_to]))

    return obj
def test_base(self):
    """Round-trips RefLog serialization, appends entries and checks indexed access."""
    rlp_head = fixture_path('reflog_HEAD')
    rlp_master = fixture_path('reflog_master')
    # mkdtemp creates the directory atomically; mktemp() + os.mkdir() was
    # racy and tempfile.mktemp is deprecated
    tdir = tempfile.mkdtemp(suffix="test_reflogs")

    rlp_master_ro = RefLog.path(self.rorepo.head)
    assert osp.isfile(rlp_master_ro)

    # simple read
    reflog = RefLog.from_file(rlp_master_ro)
    assert reflog._path is not None
    assert isinstance(reflog, RefLog)
    assert len(reflog)

    # iter_entries works with path and with stream; close the stream we open
    with open(rlp_master, 'rb') as fp:
        assert len(list(RefLog.iter_entries(fp)))
    assert len(list(RefLog.iter_entries(rlp_master)))

    # raise on invalid revlog
    # TODO: Try multiple corrupted ones !
    pp = 'reflog_invalid_'
    for suffix in ('oldsha', 'newsha', 'email', 'date', 'sep'):
        # failUnlessRaises is a long-deprecated alias removed in Python 3.12
        self.assertRaises(ValueError, RefLog.from_file, fixture_path(pp + suffix))
    # END for each invalid file

    # cannot write an uninitialized reflog
    self.assertRaises(ValueError, RefLog().write)

    # test serialize and deserialize - results must match exactly
    binsha = hex_to_bin(('f' * 40).encode('ascii'))
    msg = "my reflog message"
    cr = self.rorepo.config_reader()
    for rlp in (rlp_head, rlp_master):
        reflog = RefLog.from_file(rlp)
        tfile = osp.join(tdir, osp.basename(rlp))
        reflog.to_file(tfile)
        assert reflog.write() is reflog

        # parsed result must match ...
        treflog = RefLog.from_file(tfile)
        assert treflog == reflog

        # ... as well as each bytes of the written stream
        # (context managers avoid leaking the file handles)
        with open(tfile) as fp_new, open(rlp) as fp_orig:
            assert fp_new.read() == fp_orig.read()

        # append an entry
        entry = RefLog.append_entry(cr, tfile, IndexObject.NULL_BIN_SHA, binsha, msg)
        assert entry.oldhexsha == IndexObject.NULL_HEX_SHA
        assert entry.newhexsha == 'f' * 40
        assert entry.message == msg
        assert RefLog.from_file(tfile)[-1] == entry

        # index entry
        # raises on invalid index
        self.assertRaises(IndexError, RefLog.entry_at, rlp, 10000)

        # indices can be positive ...
        assert isinstance(RefLog.entry_at(rlp, 0), RefLogEntry)
        RefLog.entry_at(rlp, 23)

        # ... and negative
        for idx in (-1, -24):
            RefLog.entry_at(rlp, idx)
        # END for each index to read
    # END for each reflog

    # finally remove our temporary data
    rmtree(tdir)
def blame_incremental(self, rev, file, **kwargs):
    """Iterator for blame information for the given file at the given revision.

    Unlike .blame(), this does not return the actual file's contents, only
    a stream of BlameEntry tuples.

    :parm rev: revision specifier, see git-rev-parse for viable options.
    :return: lazy iterator of BlameEntry tuples, where the commit
             indicates the commit to blame for the line, and range
             indicates a span of line numbers in the resulting file.

    If you combine all line number ranges outputted by this command, you
    should get a continuous range spanning all line numbers in the file.
    """
    data = self.git.blame(rev, '--', file, p=True, incremental=True,
                          stdout_as_string=False, **kwargs)
    commits = dict()

    stream = (line for line in data.split(b'\n') if line)
    while True:
        line = next(stream)  # when exhausted, causes a StopIteration, terminating this function
        # each entry starts with: <hexsha> <orig_lineno> <lineno> <num_lines>
        hexsha, orig_lineno, lineno, num_lines = line.split()
        lineno = int(lineno)
        num_lines = int(num_lines)
        orig_lineno = int(orig_lineno)
        if hexsha not in commits:
            # Now read the next few lines and build up a dict of properties
            # for this commit
            props = dict()
            while True:
                line = next(stream)
                if line == b'boundary':
                    # "boundary" indicates a root commit and occurs
                    # instead of the "previous" tag
                    continue
                tag, value = line.split(b' ', 1)
                props[tag] = value
                if tag == b'filename':
                    # "filename" formally terminates the entry for --incremental
                    orig_filename = value
                    break

            c = Commit(self, hex_to_bin(hexsha),
                       author=Actor(safe_decode(props[b'author']),
                                    safe_decode(props[b'author-mail'].lstrip(b'<').rstrip(b'>'))),
                       authored_date=int(props[b'author-time']),
                       committer=Actor(safe_decode(props[b'committer']),
                                       safe_decode(props[b'committer-mail'].lstrip(b'<').rstrip(b'>'))),
                       committed_date=int(props[b'committer-time']))
            commits[hexsha] = c
        else:
            # Discard the next line (it's a filename end tag)
            # NOTE(review): this assumes exactly one property line follows for
            # already-seen commits — verify against git's --incremental output
            line = next(stream)
            tag, value = line.split(b' ', 1)
            assert tag == b'filename', 'Unexpected git blame output'
            orig_filename = value

        yield BlameEntry(commits[hexsha],
                         range(lineno, lineno + num_lines),
                         safe_decode(orig_filename),
                         range(orig_lineno, orig_lineno + num_lines))
def blame(repo, start_commit, end_commit, filename):
    """Yield (commit, lines) tuples blaming each changed line of ``filename``
    in the range ``start_commit^..end_commit`` on the commit that introduced it.

    Boundary (root) commits are recorded as ``False`` and filtered from the
    final output."""
    data = repo.git.blame('%s^..%s' % (start_commit, end_commit), '--', filename, p=True)
    commits = dict()
    blames = list()
    info = None

    for line in data.splitlines(False):
        parts = repo.re_whitespace.split(line, 1)
        firstpart = parts[0]
        if repo.re_hexsha_only.search(firstpart):
            # handles
            # 634396b2f541a9f2d58b00be1a07f0c358b999b3 1 1 7        - indicates blame-data start
            # 634396b2f541a9f2d58b00be1a07f0c358b999b3 2 2
            digits = parts[-1].split(" ")
            if len(digits) == 3:
                info = {'id': firstpart}
                blames.append([None, []])
            elif info['id'] != firstpart:
                info = {'id': firstpart}
                blames.append([commits.get(firstpart), []])
            # END blame data initialization
        else:
            m = repo.re_author_committer_start.search(firstpart)
            if m:
                # handles:
                # author Tom Preston-Werner
                # author-mail <*****@*****.**>
                # author-time 1192271832
                # author-tz -0700
                # committer Tom Preston-Werner
                # committer-mail <*****@*****.**>
                # committer-time 1192271832
                # committer-tz -0700  - IGNORED BY US
                role = m.group(0)
                if firstpart.endswith('-mail'):
                    info["%s_email" % role] = parts[-1]
                elif firstpart.endswith('-time'):
                    info["%s_date" % role] = int(parts[-1])
                elif role == firstpart:
                    info[role] = parts[-1]
                # END distinguish mail,time,name
            else:
                # handle
                # filename lib/grit.rb
                # summary add Blob
                # <and rest>
                if firstpart.startswith('filename'):
                    info['filename'] = parts[-1]
                elif firstpart.startswith('summary'):
                    info['summary'] = parts[-1]
                elif firstpart.startswith('boundary'):
                    info['boundary'] = True
                elif firstpart == '':
                    if info:
                        sha = info['id']
                        c = commits.get(sha)
                        if c is None:
                            if info.get('boundary'):
                                # boundary commits are marked with False so they
                                # can be skipped when yielding results
                                commits[sha] = False
                            else:
                                c = repo.CommitCls(
                                    repo, hex_to_bin(sha),
                                    author=Actor._from_string(info['author'] + ' ' + info['author_email']),
                                    authored_date=info['author_date'],
                                    committer=Actor._from_string(info['committer'] + ' ' + info['committer_email']),
                                    committed_date=info['committer_date'],
                                    message=info['summary']
                                )
                                commits[sha] = c
                        if c is not False:
                            # END if commit objects needs initial creation
                            m = repo.re_tab_full_line.search(line)
                            text, = m.groups()
                            blames[-1][0] = c
                            blames[-1][1].append(text)
                        info = {'id': sha}
                    # END if we collected commit info
                # END distinguish filename,summary,rest
            # END distinguish author|committer vs filename,summary,rest
        # END distinguish hexsha vs other information

    for commit, lines in blames:
        if commit is not None:
            yield commit, lines
def blame(self, rev, file, incremental=False, **kwargs):
    """The blame information for the given file at the given revision.

    :parm rev: revision specifier, see git-rev-parse for viable options.
    :return:
        list: [git.Commit, list: [<line>]]
        A list of tuples associating a Commit object with a list of lines that
        changed within the given commit. The Commit objects will be given in order
        of appearance."""
    if incremental:
        return self.blame_incremental(rev, file, **kwargs)
    # -p requests the stable porcelain output format
    data = self.git.blame(rev, '--', file, p=True, stdout_as_string=False, **kwargs)
    commits = dict()
    blames = list()
    info = None

    keepends = True
    for line in data.splitlines(keepends):
        try:
            line = line.rstrip().decode(defenc)
        except UnicodeDecodeError:
            firstpart = ''
            is_binary = True
        else:
            # As we don't have an idea when the binary data ends, as it could contain multiple newlines
            # in the process. So we rely on being able to decode to tell us what is is.
            # This can absolutely fail even on text files, but even if it does, we should be fine treating it
            # as binary instead
            parts = self.re_whitespace.split(line, 1)
            firstpart = parts[0]
            is_binary = False
        # end handle decode of line
        # NOTE(review): in the binary branch above, `parts` keeps its value from a
        # previous iteration (or is unbound on the very first line) — verify callers
        # never feed a stream whose first line fails to decode.

        if self.re_hexsha_only.search(firstpart):
            # handles
            # 634396b2f541a9f2d58b00be1a07f0c358b999b3 1 1 7        - indicates blame-data start
            # 634396b2f541a9f2d58b00be1a07f0c358b999b3 2 2          - indicates
            # another line of blame with the same data
            digits = parts[-1].split(" ")
            if len(digits) == 3:
                info = {'id': firstpart}
                blames.append([None, []])
            elif info['id'] != firstpart:
                info = {'id': firstpart}
                blames.append([commits.get(firstpart), []])
            # END blame data initialization
        else:
            m = self.re_author_committer_start.search(firstpart)
            if m:
                # handles:
                # author Tom Preston-Werner
                # author-mail <*****@*****.**>
                # author-time 1192271832
                # author-tz -0700
                # committer Tom Preston-Werner
                # committer-mail <*****@*****.**>
                # committer-time 1192271832
                # committer-tz -0700  - IGNORED BY US
                role = m.group(0)
                if firstpart.endswith('-mail'):
                    info["%s_email" % role] = parts[-1]
                elif firstpart.endswith('-time'):
                    info["%s_date" % role] = int(parts[-1])
                elif role == firstpart:
                    info[role] = parts[-1]
                # END distinguish mail,time,name
            else:
                # handle
                # filename lib/grit.rb
                # summary add Blob
                # <and rest>
                if firstpart.startswith('filename'):
                    info['filename'] = parts[-1]
                elif firstpart.startswith('summary'):
                    info['summary'] = parts[-1]
                elif firstpart == '':
                    if info:
                        sha = info['id']
                        c = commits.get(sha)
                        if c is None:
                            # commit objects are created lazily, once per unique sha
                            c = Commit(self, hex_to_bin(sha),
                                       author=Actor._from_string(info['author'] + ' ' + info['author_email']),
                                       authored_date=info['author_date'],
                                       committer=Actor._from_string(
                                           info['committer'] + ' ' + info['committer_email']),
                                       committed_date=info['committer_date'])
                            commits[sha] = c
                        # END if commit objects needs initial creation
                        if not is_binary:
                            # content lines are tab-prefixed in porcelain output
                            if line and line[0] == '\t':
                                line = line[1:]
                        else:
                            # NOTE: We are actually parsing lines out of binary data, which can lead to the
                            # binary being split up along the newline separator. We will append this to the blame
                            # we are currently looking at, even though it should be concatenated with the last line
                            # we have seen.
                            pass
                        # end handle line contents
                        blames[-1][0] = c
                        blames[-1][1].append(line)
                        info = {'id': sha}
                    # END if we collected commit info
                # END distinguish filename,summary,rest
            # END distinguish author|committer vs filename,summary,rest
        # END distinguish hexsha vs other information
    return blames
def _deserialize(self, stream):
    """Deserialize this commit object from a raw-commit data stream.

    :param stream: binary file-like object positioned at the start of the
        commit data (tree/parent/author/committer headers, optional
        mergetag/encoding/gpgsig headers, then the message)
    :return: self"""
    readline = stream.readline
    # first line is always "tree <hexsha>"
    self.tree = Tree(self.repo, hex_to_bin(readline().split()[1]), Tree.tree_id << 12, '')

    self.parents = list()
    next_line = None
    while True:
        parent_line = readline()
        if not parent_line.startswith(b'parent'):
            next_line = parent_line
            break
        # END abort reading parents
        self.parents.append(type(self)(self.repo, hex_to_bin(parent_line.split()[-1].decode('ascii'))))
    # END for each parent line
    self.parents = tuple(self.parents)

    # we don't know actual author encoding before we have parsed it, so keep the lines around
    author_line = next_line
    committer_line = readline()

    # we might run into one or more mergetag blocks, skip those for now
    next_line = readline()
    while next_line.startswith(b'mergetag '):
        next_line = readline()
        while next_line.startswith(b' '):
            next_line = readline()
    # end skip mergetags

    # now we can have the encoding line, or an empty line followed by the optional
    # message.
    self.encoding = self.default_encoding
    self.gpgsig = None

    # read headers
    enc = next_line
    buf = enc.strip()
    while buf:
        if buf.startswith(b"encoding "):
            # BUGFIX: the previous check `buf[0:10] == b"encoding "` compared a
            # 10-byte slice against a 9-byte literal and could never match, so the
            # encoding header was silently ignored; it also used buf.find(' ')
            # (str needle on bytes), which raises TypeError on Python 3.
            self.encoding = buf[buf.find(b' ') + 1:].decode('ascii')
        elif buf[0:7] == b"gpgsig ":
            # signature continuation lines are prefixed with a single space
            sig = buf[buf.find(b' ') + 1:] + b"\n"
            is_next_header = False
            while True:
                sigbuf = readline()
                if not sigbuf:
                    break
                if sigbuf[0:1] != b" ":
                    # not a continuation - this line already belongs to the next header
                    buf = sigbuf.strip()
                    is_next_header = True
                    break
                sig += sigbuf[1:]
            # end read all signature
            self.gpgsig = sig.rstrip(b"\n").decode('ascii')
            if is_next_header:
                continue
        buf = readline().strip()

    # decode the authors name
    try:
        self.author, self.authored_date, self.author_tz_offset = \
            parse_actor_and_date(author_line.decode(self.encoding, 'replace'))
    except UnicodeDecodeError:
        log.error("Failed to decode author line '%s' using encoding %s",
                  author_line, self.encoding, exc_info=True)

    try:
        self.committer, self.committed_date, self.committer_tz_offset = \
            parse_actor_and_date(committer_line.decode(self.encoding, 'replace'))
    except UnicodeDecodeError:
        log.error("Failed to decode committer line '%s' using encoding %s",
                  committer_line, self.encoding, exc_info=True)
    # END handle author's encoding

    # a stream from our data simply gives us the plain message
    # The end of our message stream is marked with a newline that we strip
    self.message = stream.read()
    try:
        self.message = self.message.decode(self.encoding, 'replace')
    except UnicodeDecodeError:
        log.error("Failed to decode message '%s' using encoding %s",
                  self.message, self.encoding, exc_info=True)
    # END exception handling

    return self
def stream(self, sha): """For now, all lookup is done by git itself""" hexsha, typename, size, stream = self._git.stream_object_data(bin_to_hex(sha)) return OStream(hex_to_bin(hexsha), typename, size, stream)
def _h2b(prefix): return hex_to_bin(_pad(prefix))
def test_list(self): assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)[hex_to_bin('5117c9c8a4d3af19a9958677e45cda9269de1541')], Commit)
def info(self, sha): hexsha, typename, size = self._git.get_object_header(bin_to_hex(sha)) return OInfo(hex_to_bin(hexsha), typename, size)
def rev_parse(repo, rev):
    """
    :return: Object at the given revision, either Commit, Tag, Tree or Blob
    :param rev: git-rev-parse compatible revision specification, please see
        http://www.kernel.org/pub/software/scm/git/docs/git-rev-parse.html
        for details
    :note: Currently there is no access to the rev-log, rev-specs may only contain
        topological tokens such ~ and ^.
    :raise BadObject: if the given revision could not be found
    :raise ValueError: If rev couldn't be parsed
    :raise IndexError: If invalid reflog index is specified"""
    # colon search mode ?
    if rev.startswith(':/'):
        # colon search mode
        raise NotImplementedError("commit by message search ( regex )")
    # END handle search

    obj = None
    ref = None
    output_type = "commit"
    start = 0
    parsed_to = 0
    lr = len(rev)
    # scan for the next operator token; everything before it names a ref/object
    while start < lr:
        if rev[start] not in "^~:@":
            start += 1
            continue
        # END handle start

        token = rev[start]

        if obj is None:
            # token is a rev name
            if start == 0:
                ref = repo.head.ref
            else:
                if token == '@':
                    ref = name_to_object(repo, rev[:start], return_ref=True)
                else:
                    obj = name_to_object(repo, rev[:start])
                # END handle token
            # END handle refname

            if ref is not None:
                obj = ref.commit
            # END handle ref
        # END initialize obj on first token

        start += 1

        # try to parse {type}
        if start < lr and rev[start] == '{':
            end = rev.find('}', start)
            if end == -1:
                raise ValueError("Missing closing brace to define type in %s" % rev)
            output_type = rev[start + 1:end]  # exclude brace

            # handle type
            if output_type == 'commit':
                pass  # default
            elif output_type == 'tree':
                try:
                    obj = to_commit(obj).tree
                except (AttributeError, ValueError):
                    pass  # error raised later
                # END exception handling
            elif output_type in ('', 'blob'):
                if obj.type == 'tag':
                    obj = deref_tag(obj)
                else:
                    # cannot do anything for non-tags
                    pass
                # END handle tag
            elif token == '@':
                # try single int
                assert ref is not None, "Require Reference to access reflog"
                revlog_index = None
                try:
                    # transform reversed index into the format of our revlog
                    revlog_index = -(int(output_type) + 1)
                except ValueError:
                    # TODO: Try to parse the other date options, using parse_date
                    # maybe
                    raise NotImplementedError("Support for additional @{...} modes not implemented")
                # END handle revlog index

                try:
                    entry = ref.log_entry(revlog_index)
                except IndexError:
                    raise IndexError("Invalid revlog index: %i" % revlog_index)
                # END handle index out of bound

                obj = Object.new_from_sha(repo, hex_to_bin(entry.newhexsha))

                # make it pass the following checks
                output_type = None
            else:
                raise ValueError("Invalid output type: %s ( in %s )" % (output_type, rev))
            # END handle output type

            # empty output types don't require any specific type, its just about dereferencing tags
            if output_type and obj.type != output_type:
                raise ValueError("Could not accomodate requested object type %r, got %s"
                                 % (output_type, obj.type))
            # END verify output type

            start = end + 1  # skip brace
            parsed_to = start
            continue
        # END parse type

        # try to parse a number
        num = 0
        if token != ":":
            found_digit = False
            while start < lr:
                if rev[start] in digits:
                    num = num * 10 + int(rev[start])
                    start += 1
                    found_digit = True
                else:
                    break
                # END handle number
            # END number parse loop

            # no explicit number given, 1 is the default
            # It could be 0 though
            if not found_digit:
                num = 1
            # END set default num
        # END number parsing only if non-blob mode

        parsed_to = start
        # handle hierarchy walk
        try:
            if token == "~":
                obj = to_commit(obj)
                for item in xrange(num):
                    obj = obj.parents[0]
                # END for each history item to walk
            elif token == "^":
                obj = to_commit(obj)
                # must be n'th parent
                if num:
                    obj = obj.parents[num - 1]
            elif token == ":":
                if obj.type != "tree":
                    obj = obj.tree
                # END get tree type
                obj = obj[rev[start:]]
                parsed_to = lr
            else:
                raise ValueError("Invalid token: %r" % token)
            # END end handle tag
        except (IndexError, AttributeError):
            raise BadObject("Invalid Revision in %s" % rev)
        # END exception handling
    # END parse loop

    # still no obj ? Its probably a simple name
    if obj is None:
        obj = name_to_object(repo, rev)
        parsed_to = lr
    # END handle simple name

    if obj is None:
        raise ValueError("Revision specifier could not be parsed: %s" % rev)

    if parsed_to != lr:
        raise ValueError("Didn't consume complete rev spec %s, consumed part: %s"
                         % (rev, rev[:parsed_to]))

    return obj