def test_initialize_simple(self):
    """A freshly constructed file blob with my_hash set should render the expected details string."""
    logging.debug('Testing initializing a simple file blob')
    reset_storage()
    blob = file_blob()
    blob.my_hash = '1'
    expected = ('\n=========Printing details of file blob object========='
                '\nmy_hash: 1'
                '\nparent_hash: '
                '\nblob_type: file'
                '\n------------------------------------------------------')
    self.assertEqual(expected, str(blob))
def restore_directory(self, key, working_directory, storage_directory, commit_hash):
    '''Restore a whole committed directory tree into working_directory.

    key -- decryption key for the stored blobs
    working_directory -- destination root the restored files are written under
    storage_directory -- location of the encrypted blob store
    commit_hash -- hash of the commit to restore
    '''
    logging.info('working_directory: %s, storage_directory: %s, commit_hash: %s',
                 working_directory, storage_directory, commit_hash)
    #@TODO: Write to a temp directory first and then cut to the working directory? Would ensure user has very little possibility to see on partial files.
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #restore tree folder structure
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.build_tree(key, storage_directory)
    logging.debug('tree to restore:\n%s', str(tb))
    tb.write_folders(working_directory)
    #restore every file recorded in the tree
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        fb = file_blob()
        fb.load(key, storage_directory, node.hash_hex)
        full_file_path = os.path.join(working_directory, path)
        # fix: use a context manager so each handle is closed; the original
        # leaked one open file per restored file and never flushed explicitly
        with open(full_file_path, 'wb') as f:
            f.write(fb.apply_delta(key, storage_directory))
def test_initialize_simple(self):
    """Check the printable representation of a newly initialized file blob."""
    logging.debug('Testing initializing a simple file blob')
    reset_storage()
    subject = file_blob()
    subject.my_hash = '1'
    wanted = '\n=========Printing details of file blob object=========\nmy_hash: 1\nparent_hash: \nblob_type: file\n------------------------------------------------------'
    self.assertEqual(wanted, str(subject))
def restore_directory(self, key, working_directory, storage_directory, commit_hash):
    '''Restore a whole committed directory tree into working_directory.

    key -- decryption key for the stored blobs
    working_directory -- destination root for the restored files
    storage_directory -- location of the encrypted blob store
    commit_hash -- hash of the commit to restore
    '''
    logging.info('working_directory: %s, storage_directory: %s, commit_hash: %s',
                 working_directory, storage_directory, commit_hash)
    #@TODO: Write to a temp directory first and then cut to the working directory? Would ensure user has very little possibility to see on partial files.
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #restore tree folder structure
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.build_tree(key, storage_directory)
    logging.debug('tree to restore:\n%s', str(tb))
    tb.write_folders(working_directory)
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        #decrypt and rebuild each file blob, then write it under working_directory
        fb = file_blob()
        fb.load(key, storage_directory, node.hash_hex)
        # fix: the original opened one handle per file and never closed it;
        # 'with' guarantees close/flush even if apply_delta raises
        with open(os.path.join(working_directory, path), 'wb') as out:
            out.write(fb.apply_delta(key, storage_directory))
def restore_directory(self, key, working_directory, storage_directory, commit_hash):
    '''Restore a full committed directory (used when loading an initial commit).

    key -- decryption key for the stored blobs
    working_directory -- destination root for the restored files
    storage_directory -- location of the encrypted blob store
    commit_hash -- hash of the commit to restore
    '''
    logging.info('working_directory: %s, storage_directory: %s, commit_hash: %s',
                 working_directory, storage_directory, commit_hash)
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #restore tree folder structure
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.display()
    file_listing = tb.write_directory_structure(key, storage_directory, working_directory)
    #restore files
    for (path, file_name, hash_hex, file_size) in file_listing:
        fb = file_blob()
        fb.load(key, storage_directory, hash_hex)
        #NOTE(review): paths are built by concatenation; 'path' presumably carries
        #its own leading separator -- confirm before switching to os.path.join
        full_file_path = working_directory + path + '/' + file_name
        # fix: close each restored file handle (original leaked one per file)
        with open(full_file_path, 'wb') as f:
            f.write(fb.apply_delta(key, storage_directory))
def blobs_to_restore_blob(self, key, storage_directory, file_name, parent_tree=True):
    """Return a list of blob file names still needed to restore a blob.

    Returns None when the blob and everything it depends on are already
    available locally. Will likely need to be called repeatedly on a commit
    because needed blobs also have dependent blobs which won't be known
    until the needed blobs are obtained.

    key -- decryption key for the stored blobs
    storage_directory -- local blob store to check
    file_name -- blob file name; a leading '_' marks a commit blob
    parent_tree -- when False, only the parent chain of a tree is traversed
    """
    #blob not present locally: it is itself the needed blob
    if not os.path.exists(os.path.join(storage_directory, file_name)):
        return [file_name]
    #commit blob: only its tree is needed (parent commits are not required
    #to restore a commit or its files)
    if file_name[0] == '_':
        cb = commit_blob()
        cb.load(key, storage_directory, file_name[1:])
        return self.blobs_to_restore_blob(key, storage_directory, cb.tree_hash)
    fb = file_blob()
    fb.load(key, storage_directory, file_name)
    if fb.blob_type == 'tree':
        tb = tree_blob()
        tb.load(key, storage_directory, file_name)  #TODO: can this be casted?
        if tb.parent_hash != '':
            #check the whole parent chain of the tree first
            # fix: idiomatic comparisons ('is not None' instead of '== None',
            # '!=' instead of 'not ... =='); behavior unchanged
            needed_tree_parent_hash = self.blobs_to_restore_blob(
                key, storage_directory, tb.parent_hash, False)
            if needed_tree_parent_hash is not None:
                return needed_tree_parent_hash
        if not parent_tree:
            #only traverse tree structure of parent tree
            return None
        #check every file referenced by this tree
        needed_files = []
        for h in tb.file_hashes(key, storage_directory):
            temp_hash = self.blobs_to_restore_blob(key, storage_directory, h)
            if temp_hash is not None:
                needed_files.extend(temp_hash)
        if not needed_files:
            return None
        return needed_files
    #fb is a plain file blob: follow its parent delta chain, if any
    if fb.parent_hash == '':
        return None
    return self.blobs_to_restore_blob(key, storage_directory, fb.parent_hash)
def test_delta_simple(self):
    """An initial (parentless) delta should hash the content and keep the full text as one aggregate."""
    logging.debug('Testing a delta on a file blob')
    reset_storage()
    # fix: use a context manager so the sample file handle is closed (was leaked)
    with open('../resource/sample_text_1.txt', 'rb') as f:
        fb = file_blob()
        fb.my_hash = '1'
        fb.compute_delta(key, f.read())
    expected_string = '\n=========Printing details of file blob object=========\nmy_hash: a1f7f4060ff3f7c0b7c28b4a37990a851a225e475d47da7a0d82dec5\nparent_hash: \nblob_type: file\naggregate a[0:0] b[0:0] first line\nabcdefghijklmnopqrstuvwxyz\nthird line\nfourth\nlast line------------------------------------------------------'
    self.assertEqual(expected_string, str(fb))
def test_delta_simple(self):
    """Verify str() output after computing an initial delta from sample_text_1.txt."""
    logging.debug('Testing a delta on a file blob')
    reset_storage()
    # fix: read through a context manager so the handle is closed (original leaked it)
    with open('../resource/sample_text_1.txt', 'rb') as sample:
        sample_bytes = sample.read()
    fb = file_blob()
    fb.my_hash = '1'
    fb.compute_delta(key, sample_bytes)
    expected_string = '\n=========Printing details of file blob object=========\nmy_hash: a1f7f4060ff3f7c0b7c28b4a37990a851a225e475d47da7a0d82dec5\nparent_hash: \nblob_type: file\naggregate a[0:0] b[0:0] first line\nabcdefghijklmnopqrstuvwxyz\nthird line\nfourth\nlast line------------------------------------------------------'
    self.assertEqual(expected_string, str(fb))
def blobs_to_restore_blob(self, key, storage_directory, file_name, parent_tree=True):
    """Return a list of blob file names needed before file_name can be restored.

    Returns None when restoration is already possible from local files.
    Will likely need to be called repeatedly on a commit because needed
    blobs also have dependent blobs which won't be known until the needed
    blobs are obtained.

    key -- decryption key for the stored blobs
    storage_directory -- local blob store to check
    file_name -- blob file name; a leading '_' marks a commit blob
    parent_tree -- when False, only the parent chain of a tree is traversed
    """
    # fix throughout: replace '== None' / 'not x == y' with the idiomatic
    # 'is None' / '!=' forms; logic is otherwise unchanged
    if not os.path.exists(os.path.join(storage_directory, file_name)):
        #the blob itself is missing
        return [file_name]
    if file_name[0] == '_':
        #commit blob: only the tree matters; parent commits are not needed
        #to restore a commit or its files
        cb = commit_blob()
        cb.load(key, storage_directory, file_name[1:])
        return self.blobs_to_restore_blob(key, storage_directory, cb.tree_hash)
    fb = file_blob()
    fb.load(key, storage_directory, file_name)
    if fb.blob_type == 'tree':
        tb = tree_blob()  #TODO: can this be casted?
        tb.load(key, storage_directory, file_name)
        if tb.parent_hash != '':
            #check for all parents of tree
            needed = self.blobs_to_restore_blob(key, storage_directory,
                                                tb.parent_hash, False)
            if needed is not None:
                return needed
        if not parent_tree:
            #only traverse tree structure of parent tree
            return None
        #check for all files in tree structure
        needed_files = []
        for h in tb.file_hashes(key, storage_directory):
            missing = self.blobs_to_restore_blob(key, storage_directory, h)
            if missing is not None:
                needed_files.extend(missing)
        return needed_files or None
    #fb is a file blob: follow its delta parent chain
    if fb.parent_hash == '':
        return None
    return self.blobs_to_restore_blob(key, storage_directory, fb.parent_hash)
def restore_directory(self, key, working_directory, storage_directory, commit_hash):
    '''Restore a full committed directory (used when loading an initial commit).

    key -- decryption key for the stored blobs
    working_directory -- destination root for the restored files
    storage_directory -- location of the encrypted blob store
    commit_hash -- hash of the commit to restore
    '''
    logging.info('working_directory: %s, storage_directory: %s, commit_hash: %s',
                 working_directory, storage_directory, commit_hash)
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #restore tree folder structure
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.display()
    file_listing = tb.write_directory_structure(key, storage_directory, working_directory)
    #restore files
    for (path, file_name, hash_hex, file_size) in file_listing:
        fb = file_blob()
        fb.load(key, storage_directory, hash_hex)
        #NOTE(review): concatenated path assumes 'path' starts with a separator;
        #confirm before replacing with os.path.join
        full_file_path = working_directory + path + '/' + file_name
        # fix: the original never closed these handles; one leaked per file
        with open(full_file_path, 'wb') as out:
            out.write(fb.apply_delta(key, storage_directory))
#wipe peer C's storage directory from the bottom up (files first, then the
#now-empty folders), so each test run starts from a clean slate
for root, dirs, files in os.walk(peer_C_storage, topdown=False):
    for name in files:
        os.remove(os.path.join(root, name))
    for name in dirs:
        os.rmdir(os.path.join(root, name))
#create the shared folder used by the transfer tests below
os.mkdir(os.path.join(peer_A_storage, 'test_share'))
print '\n\n'
print '************************************************************************'
print '***Testing initializing a file blob'
logging.debug('Testing initializing a file blob')
print '************************************************************************'
#open a text file and import into file blob
f = open('../resource/sample_text_1.txt', 'rb')
fb = file_blob()
fb.my_hash = '1'
fb.display()  #show the blob before the delta is computed
fb.compute_delta(key, f.read())
fb.display()  #show the blob after the delta is computed
print '\n\n'
print '************************************************************************'
print '***Testing a simple file delta'
logging.debug('Testing a simple file delta')
print '************************************************************************'
#open a 2nd version of text file and compute delta from first version
f2 = open('../resource/sample_text_2.txt', 'rb')
fb2 = file_blob()
fb2.my_hash = '2'
fb2.display()
def store_file_blobs(self, key, commit_hash, parent_commit_hash, storage_directory, working_directory):
    """Store the file blobs of a commit, delta-compressed against the parent where possible.

    key -- encryption key for stored blobs
    commit_hash -- commit whose files are being stored
    parent_commit_hash -- previous commit, or None for an initial commit
    storage_directory -- blob store destination
    working_directory -- directory holding the working copies of the files
    """
    logging.info(
        'commit_hash: %s, parent_commit_hash: %s, storage_directory: %s, working_directory: %s',
        commit_hash, parent_commit_hash, storage_directory, working_directory)
    #chop the root folder off working_directory; tree paths already include it
    working_directory, tail = os.path.split(working_directory)
    #load current commit, tree, and file info
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    tb = tree_blob()  #it is okay to modify this tree blob. The one stored for the commit is already saved.
    tb.load(key, storage_directory, cb.tree_hash)
    tb.build_tree(key, storage_directory)
    logging.debug('tree to store: %s', tb)
    #drop tree entries whose hashes already exist in the storage directory
    #or appeared earlier in this same tree
    hash_set = set()
    for root, dirs, files in os.walk(storage_directory):
        for f in files:
            hash_set.add(f)
    logging.debug('storage directory hashes: %s', hash_set)
    #NOTE(review): rm_node() is called while iterating tb.walk(); assumes the
    #walk tolerates removal during traversal -- confirm against tree_blob
    for path, node in tb.walk():
        logging.debug('checking: %s', path)
        if node.node_type != 'file':
            continue
        if node.hash_hex in hash_set:
            logging.debug('found hash match: %s', node.hash_hex)
            tb.rm_node(path, 'file')
        else:
            hash_set.add(node.hash_hex)
    if parent_commit_hash is None:
        #initial commit: store every remaining file as a full (parentless) blob
        for path, node in tb.walk():
            if node.node_type != 'file':
                continue
            # fix: open in binary mode like every other read in this module
            # (was 'r', which corrupts binary data on Windows) and close the handle
            with open(os.path.join(working_directory, path), 'rb') as new_file:
                fb = file_blob()
                fb.compute_delta(key, new_file.read())
                fb.store(key, storage_directory)
        return
    #load parent commit, tree, and file info
    pcb = commit_blob()
    pcb.load(key, storage_directory, parent_commit_hash)
    ptb = tree_blob()
    ptb.load(key, storage_directory, pcb.tree_hash)
    ptb.build_tree(key, storage_directory)
    logging.debug('performing differential commit using following trees')
    logging.debug('parent tree: %s', str(ptb))
    logging.debug('child tree: %s', str(tb))
    #files with the same name and path in the parent commit are stored as deltas
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        if not ptb.has_node(path, 'file'):
            continue
        p_node = ptb.get_node(path, 'file')
        logging.debug(
            'Found files with matching paths and names. working: %s, parent: %s',
            node.hash_hex, p_node.hash_hex)
        # fix: close the working-copy handle (original leaked one per file)
        with open(os.path.join(working_directory, path), 'rb') as new_file:
            pfb = file_blob()
            pfb.load(key, storage_directory, p_node.hash_hex)
            fb = file_blob()
            fb.compute_delta(key, new_file.read(), pfb, storage_directory)
            fb.store(key, storage_directory)
        tb.rm_node(path, 'file')
    #TODO: re-implement similar-content detection (the removed commented-out
    #difflib.SequenceMatcher pass that diffed renamed/moved files against
    #size- and content-similar parent blobs)
    #store all remaining files as initial versions
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        with open(os.path.join(working_directory, path), 'rb') as new_file:
            fb = file_blob()
            fb.compute_delta(key, new_file.read())
            fb.store(key, storage_directory)
os.remove(os.path.join(root, name)) for name in dirs: os.rmdir(os.path.join(root, name)) os.mkdir(os.path.join(peer_A_storage, "test_share")) print "\n\n" print "************************************************************************" print "***Testing initializing a file blob" logging.debug("Testing initializing a file blob") print "************************************************************************" # open a text file and import into file blob f = open("../resource/sample_text_1.txt", "rb") fb = file_blob() fb.my_hash = "1" fb.display() fb.compute_delta(key, f.read()) fb.display() print "\n\n" print "************************************************************************" print "***Testing a simple file delta" logging.debug("Testing a simple file delta") print "************************************************************************" # open a 2nd version of text file and compute delta from first version f2 = open("../resource/sample_text_2.txt", "rb") fb2 = file_blob() fb2.my_hash = "2"
def merge(tb_A, tb_B, tb_C, merge_method = 'dupe_on_conflict'):
    """Merges two trees. C is common ancestor

    merge_method determines how conflicts are handled.
        dupe_on_conflict: default. try merging. if a conflict can't be resolved automatically save both file versions.
        dupe_always: if two folders/files need to be merged always save both copies
        force_merge: try merging. in case of conflict resolve automatically even if result contains unified diff sections.
    """
    """ Algorithm
    --------------
    A: update commit
    B: old commit
    C: common ancestor commit
    N: new tree
    -store trees from A, B, C, and N in easily traversable data structure
    -iterate through folders and files from A (from top down. these iterations only add to wd)
        -if B has it
            -no change necessary
            -add to N as is
        -if C has it, but B doesn't
            -means B deleted file/folder
            -don't add to N
        -if neither C nor B have it
            -means A added it
            -push change to wd
            -add to N
        -if B has same file name and path, but different hash as A
            -attempt merge
            -add merged file(s) to N
    -iterate through folders and files from B (from bottom up. these iterations only delete from wd)
        -if neither C nor A have it
            -means B added it
            -add to N
            -no change necessary
        -if C has it, but A doesn't
            -means A deleted file/folder
            -delete file/folder from wd
    -commit wd as merge
    -issues with algorithm
        -B moves file, A edits file
    """
    #NOTE(review): 'key' and 'storage' used below are module globals, not
    #parameters -- confirm they are bound before merge() is called.
    #NOTE(review): merge_method is accepted but never consulted in this body.
    #build an empty result tree rooted like A's tree
    merge_tb = tree_blob()
    merge_tb.root_node = tree_blob.TreeNode()
    merge_tb.root_node.name = tb_A.root_node.name
    merge_tb.root_node.node_type = 'folder'
    merge_tb.root_node.size = 0
    print 'walking tree'
    #pass 1: walk the updated tree (A) top-down; this pass only adds nodes
    for path, node in tb_A.walk():
        print path
        if path == merge_tb.root_node.name:
            continue
        if tb_B.has_node(path, node.node_type) and (tb_B.get_node(path, node.node_type)).hash_hex == node.hash_hex:
            #not modified
            merge_tb.add_node(path, node.node_type, node.hash_hex, int(node.size))
            continue
        if tb_B.has_node(path, node.node_type) and (tb_B.get_node(path, node.node_type)).hash_hex != node.hash_hex:
            #merge files
            hash_hex_B = (tb_B.get_node(path, node.node_type)).hash_hex
            fb_B = file_blob()
            fb_B.load(key, storage, hash_hex_B)
            fb_A = file_blob()
            fb_A.load(key, storage, node.hash_hex)
            if tb_C.has_node(path, node.node_type):
                #ancestor version exists: use it as the 3-way merge base
                hash_hex_C = (tb_C.get_node(path, node.node_type)).hash_hex
                fb_C = file_blob()
                fb_C.load(key, storage, hash_hex_C)
            else:
                #no ancestor version: fall back to B's version as the base
                fb_C = fb_B
            merge_text = file_blob.merge(fb_A.apply_delta(key, storage), fb_B.apply_delta(key, storage), fb_C.apply_delta(key, storage))
            fb_merge = file_blob()
            fb_merge.compute_delta(key, merge_text, fb_A, storage)
            #NOTE(review): fb_merge is computed but never stored, and the node is
            #added with the A-side hash (node.hash_hex) -- the merged content
            #appears to be dropped here; confirm intended behavior.
            merge_tb.add_node(path, node.node_type, node.hash_hex, int(node.size))
            continue
        if tb_C.has_node(path, node.node_type):
            #B deleted
            continue
        merge_tb.add_node(path, node.node_type, node.hash_hex, int(node.size))  #A added
    #pass 2: walk the old tree (B) and pick up anything B added on its own
    for path, node in tb_B.walk():
        print path
        if path == merge_tb.root_node.name:
            continue
        if (not tb_C.has_node(path, node.node_type)) and (not tb_A.has_node(path, node.node_type)):
            #B added
            merge_tb.add_node(path, node.node_type, node.hash_hex, int(node.size))
    return merge_tb
for i in range(6): try: print s2.recv(1028) except socket.error as msg: pass """ print '\n\n' print '************************************************************************' print '***Testing sending a large file between two sibs' logging.debug('Testing sending a large file between two sibs') print '************************************************************************' large_file = open('../resource/alice.txt', 'rb') fb = file_blob() fb.compute_delta(key, large_file.read()) large_file_hash = fb.store(key, os.path.join(peer_B_storage, 'test_share')) time.sleep(1) sib_b = SIB() sib_b.new_sockets.put(11120) sib_b.js.storage_directory = peer_B_storage sib_b.js.my_machine_ID = 'machine_B' sib_a = SIB() sib_a.new_sockets.put(11121) sib_a.js.storage_directory = peer_A_storage sib_a.js.my_machine_ID = 'machine_A' #TODO: this transfer is slow?
def store_file_blobs(self, key, commit_hash, parent_commit_hash, storage_directory, working_directory):
    """Store a commit's file blobs; files present in the parent commit become deltas.

    key -- encryption key for stored blobs
    commit_hash -- commit whose files are being stored
    parent_commit_hash -- previous commit, or None for an initial commit
    storage_directory -- blob store destination
    working_directory -- directory holding the working copies of the files
    """
    logging.info('commit_hash: %s, parent_commit_hash: %s, storage_directory: %s, working_directory: %s',
                 commit_hash, parent_commit_hash, storage_directory, working_directory)
    #chop the root folder off working_directory
    working_directory, tail = os.path.split(working_directory)
    #load current commit, tree, and file info
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    tb = tree_blob()  #it is okay to modify this tree blob. The one stored for the commit is already saved.
    tb.load(key, storage_directory, cb.tree_hash)
    tb.build_tree(key, storage_directory)
    logging.debug('tree to store: %s', tb)
    #remove files with duplicate hashes in working directory or storage directory
    hash_set = set()
    for root, dirs, files in os.walk(storage_directory):
        for f in files:
            hash_set.add(f)
    logging.debug('storage directory hashes: %s', hash_set)
    #NOTE(review): rm_node() during tb.walk() assumes the walk tolerates
    #removal mid-traversal -- confirm against tree_blob
    for path, node in tb.walk():
        logging.debug('checking: %s', path)
        if node.node_type != 'file':
            continue
        if node.hash_hex in hash_set:
            logging.debug('found hash match: %s', node.hash_hex)
            tb.rm_node(path, 'file')
        else:
            hash_set.add(node.hash_hex)
    if parent_commit_hash is None:
        #this is an initial commit: every file becomes a full blob
        for path, node in tb.walk():
            if node.node_type != 'file':
                continue
            full_path = os.path.join(working_directory, path)
            # fix: 'rb' instead of 'r' (every other open here is binary; text
            # mode corrupts binary content on Windows), and close the handle
            with open(full_path, 'rb') as new_file:
                fb = file_blob()
                fb.compute_delta(key, new_file.read())
                fb.store(key, storage_directory)
        return
    #load parent commit, tree, and file info
    pcb = commit_blob()
    pcb.load(key, storage_directory, parent_commit_hash)
    ptb = tree_blob()
    ptb.load(key, storage_directory, pcb.tree_hash)
    ptb.build_tree(key, storage_directory)
    logging.debug('performing differential commit using following trees')
    logging.debug('parent tree: %s', str(ptb))
    logging.debug('child tree: %s', str(tb))
    #find files with the same name in the same path, compute deltas, and store as file blob diffs
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        if not ptb.has_node(path, 'file'):
            continue
        p_node = ptb.get_node(path, 'file')
        logging.debug('Found files with matching paths and names. working: %s, parent: %s',
                      node.hash_hex, p_node.hash_hex)
        full_file_path = os.path.join(working_directory, path)
        # fix: close the working-copy handle (original leaked one per file)
        with open(full_file_path, 'rb') as new_file:
            pfb = file_blob()
            pfb.load(key, storage_directory, p_node.hash_hex)
            fb = file_blob()
            fb.compute_delta(key, new_file.read(), pfb, storage_directory)
            fb.store(key, storage_directory)
        tb.rm_node(path, 'file')
    #TODO: re-implement the removed similar-content pass (difflib.SequenceMatcher
    #matching of renamed/moved files against size- and content-similar parent blobs)
    #store all remaining files as initial versions
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        full_file_path = os.path.join(working_directory, path)
        with open(full_file_path, 'rb') as new_file:
            fb = file_blob()
            fb.compute_delta(key, new_file.read())
            fb.store(key, storage_directory)
def store_file_blobs(self, key, commit_hash, parent_commit_hash, storage_directory, working_directory):
    """Store a commit's file blobs, delta-compressing against the parent commit.

    Matching is attempted in order: exact hash duplicates, same name+path,
    then similar content (size within 10% and difflib ratio >= 0.75).
    Anything unmatched is stored as a full initial blob.

    key -- encryption key for stored blobs
    commit_hash -- commit whose files are being stored
    parent_commit_hash -- previous commit, or None for an initial commit
    storage_directory -- blob store destination
    working_directory -- directory holding the working copies of the files
    """
    logging.info(
        'commit_hash: %s, parent_commit_hash: %s, storage_directory: %s, working_directory: %s',
        commit_hash, parent_commit_hash, storage_directory, working_directory)
    #chop the root folder off working_directory
    working_directory, tail = os.path.split(working_directory)
    #load current commit, tree, and file info
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    file_listing = tb.write_directory_structure(key, storage_directory, None, False)
    file_hashes = []
    file_names = []
    file_folders = []
    file_sizes = []
    for (path, file_name, hash_hex, file_size) in file_listing:
        file_hashes.append(hash_hex)
        file_names.append(file_name)
        file_folders.append(path)
        file_sizes.append(file_size)
    if parent_commit_hash is None:
        #this is an initial commit: store every file as a full version
        index = -1
        while index + 1 < len(file_hashes):
            index += 1
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            # fix: 'rb' instead of 'r' (all other reads are binary) and close the handle
            with open(full_file_path, 'rb') as new_file:
                fb = file_blob()
                fb.compute_delta(key, new_file.read())
                fb.store(key, storage_directory)
        return
    #load parent commit, tree, and file info
    pcb = commit_blob()
    pcb.load(key, storage_directory, parent_commit_hash)
    ptb = tree_blob()
    ptb.load(key, storage_directory, pcb.tree_hash)
    file_listing = ptb.write_directory_structure(key, storage_directory, None, False)
    parent_file_hashes = []
    parent_file_names = []
    parent_file_folders = []
    parent_file_sizes = []
    for (path, file_name, hash_hex, file_size) in file_listing:
        parent_file_hashes.append(hash_hex)
        parent_file_names.append(file_name)
        parent_file_folders.append(path)
        parent_file_sizes.append(file_size)
    #remove duplicate hashes within the working directory listing
    #(skipping the last element is harmless here: it has nothing after it to duplicate)
    index = -1
    while True:
        index += 1
        if index >= len(file_hashes) - 1:
            break
        if file_hashes[index] in file_hashes[index + 1:]:
            logging.debug('Found duplicate files within working directory. working: %s',
                          file_hashes[index])
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
    #remove hashes already stored for the parent commit
    index = -1
    while True:
        index += 1
        # fix: check every entry -- the original broke at len-1 and never
        # tested the last file against the parent hashes
        if index >= len(file_hashes):
            break
        if file_hashes[index] in parent_file_hashes:
            logging.debug('Found duplicate file already stored. working: %s',
                          file_hashes[index])
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
    #find files with the same name in the same path, compute deltas, and store as file blob diffs
    index = -1
    while index + 1 < len(file_hashes):
        index += 1
        parent_index = -1
        while parent_index + 1 < len(parent_file_hashes):
            parent_index += 1
            if file_names[index] != parent_file_names[parent_index]:
                continue
            if file_folders[index] != parent_file_folders[parent_index]:
                continue
            #two files with the same name and path (hash duplicates were removed
            #above, so their contents differ): store the diff
            logging.debug('Found files with matching paths and names. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            # fix: close the working-copy handle (original leaked one per match)
            with open(full_file_path, 'rb') as new_file:
                pfb = file_blob()
                pfb.load(key, storage_directory, parent_file_hashes[parent_index])
                fb = file_blob()
                fb.compute_delta(key, new_file.read(), pfb, storage_directory)
                fb.store(key, storage_directory)
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
            break
    #Look for similar files between working and parent and compute diffs on those
    index = -1
    while index + 1 < len(file_hashes):
        index += 1
        parent_index = -1
        while parent_index + 1 < len(parent_file_hashes):
            parent_index += 1
            #must have similar file sizes
            if file_sizes[index] == 0:
                continue  # fix: avoid ZeroDivisionError on empty working files
            # fix: compare against the parent entry (the original indexed
            # parent_file_sizes with 'index' instead of 'parent_index') and
            # force float division (Python 2 integer division floored the
            # ratio to 0, making the 10% threshold meaningless)
            percent_size_change = abs(
                file_sizes[index] - parent_file_sizes[parent_index]) / float(file_sizes[index])
            if percent_size_change > 0.10:
                continue
            #must have similar byte sequences
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            with open(full_file_path, 'rb') as new_file:
                new_file_text = new_file.read()
            pfb = file_blob()
            pfb.load(key, storage_directory, parent_file_hashes[parent_index])
            pfb_text = pfb.apply_delta(key, storage_directory)
            s = difflib.SequenceMatcher(None, new_file_text, pfb_text)
            if s.real_quick_ratio() < 0.75:
                continue
            if s.quick_ratio() < 0.75:
                continue
            #If this line is reached the files are similar enough. Compute the diff and store.
            logging.debug('Found files with similar content. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            fb = file_blob()
            fb.compute_delta(key, new_file_text, pfb, storage_directory)
            fb.store(key, storage_directory)
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
            break
    #store all remaining files as initial versions
    index = -1
    while index + 1 < len(file_hashes):
        index += 1
        full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
        with open(full_file_path, 'rb') as new_file:
            fb = file_blob()
            fb.compute_delta(key, new_file.read())
            fb.store(key, storage_directory)
def store_file_blobs(self, key, commit_hash, parent_commit_hash, storage_directory, working_directory):
    '''
    Store a file blob for every file referenced by the given commit's tree.

    Strategy (in order):
      1. Drop working files whose hash duplicates another working file.
      2. Drop working files whose hash already exists in the parent commit.
      3. Files matching a parent file by folder+name are stored as deltas
         against that parent blob.
      4. Files similar to a parent file (size within 10%% and difflib
         quick-ratio >= 0.75) are stored as deltas against that parent blob.
      5. Everything left is stored as a full (initial) blob.

    key                -- encryption key, passed through to the blob objects
    commit_hash        -- hash of the commit whose files are being stored
    parent_commit_hash -- hash of the parent commit, or None for an initial
                          commit (then every file is stored as a full blob)
    storage_directory  -- directory holding the encrypted blobs
    working_directory  -- directory holding the plaintext working files
    Returns None.
    '''
    logging.info('commit_hash: %s, parent_commit_hash: %s, storage_directory: %s, working_directory: %s',
                 commit_hash, parent_commit_hash, storage_directory, working_directory)
    #chop the root folder off working_directory; the tree paths include it
    working_directory, tail = os.path.split(working_directory)

    #load current commit, tree, and file info
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    file_hashes, file_names, file_folders, file_sizes = \
        self._load_file_listing(key, storage_directory, cb.tree_hash)

    if parent_commit_hash is None:
        #initial commit: no parent to diff against, store everything whole
        self._store_full_blobs(key, storage_directory, working_directory,
                               file_hashes, file_names, file_folders)
        return

    #load parent commit, tree, and file info
    pcb = commit_blob()
    pcb.load(key, storage_directory, parent_commit_hash)
    parent_file_hashes, parent_file_names, parent_file_folders, parent_file_sizes = \
        self._load_file_listing(key, storage_directory, pcb.tree_hash)

    #remove duplicate hashes within the working directory; keep one copy
    #(the LAST occurrence survives, matching the tail-slice membership test)
    index = 0
    while index < len(file_hashes):
        if file_hashes[index] in file_hashes[index + 1:]:
            logging.debug('Found duplicate files within working directory. working: %s', file_hashes[index])
            self._pop_file_record(index, file_hashes, file_names, file_folders, file_sizes)
        else:
            index += 1

    #remove working files whose blob is already stored in the parent commit.
    #BUGFIX: the original loop stopped at len-1 and never checked the last file,
    #so it was always re-stored even when unchanged.
    index = 0
    while index < len(file_hashes):
        if file_hashes[index] in parent_file_hashes:
            logging.debug('Found duplicate file already stored. working: %s', file_hashes[index])
            self._pop_file_record(index, file_hashes, file_names, file_folders, file_sizes)
        else:
            index += 1

    #find files with the same name in the same path, compute deltas, and store
    #as file blob diffs
    index = 0
    while index < len(file_hashes):
        stored = False
        for parent_index in range(len(parent_file_hashes)):
            if file_names[index] != parent_file_names[parent_index]:
                continue
            if file_folders[index] != parent_file_folders[parent_index]:
                continue
            #same name and path but different hash: store the diff
            logging.debug('Found files with matching paths and names. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            with open(full_file_path, 'rb') as new_file:  # with: no leaked handle
                new_file_data = new_file.read()
            pfb = file_blob()
            pfb.load(key, storage_directory, parent_file_hashes[parent_index])
            fb = file_blob()
            fb.compute_delta(key, new_file_data, pfb, storage_directory)
            fb.store(key, storage_directory)
            self._pop_file_record(index, file_hashes, file_names, file_folders, file_sizes)
            stored = True
            break
        if not stored:
            index += 1

    #look for similar files between working and parent and compute diffs on those
    index = 0
    while index < len(file_hashes):
        stored = False
        for parent_index in range(len(parent_file_hashes)):
            #must have similar file sizes.
            #BUGFIX: compare against parent_file_sizes[parent_index]; the
            #original indexed with [index], comparing the wrong record (and
            #risking IndexError when the parent listing is shorter).
            if file_sizes[index] == 0:
                continue  # guard divide-by-zero; empty files get stored whole
            percent_size_change = abs(file_sizes[index] - parent_file_sizes[parent_index]) / float(file_sizes[index])
            if percent_size_change > 0.10:
                continue
            #must have similar byte sequences
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            with open(full_file_path, 'rb') as new_file:
                new_file_text = new_file.read()
            pfb = file_blob()
            pfb.load(key, storage_directory, parent_file_hashes[parent_index])
            pfb_text = pfb.apply_delta(key, storage_directory)
            s = difflib.SequenceMatcher(None, new_file_text, pfb_text)
            #cheap upper-bound checks first; quick_ratio() only if they pass
            if s.real_quick_ratio() < 0.75:
                continue
            if s.quick_ratio() < 0.75:
                continue
            #similar enough: compute the diff and store
            logging.debug('Found files with similar content. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            fb = file_blob()
            fb.compute_delta(key, new_file_text, pfb, storage_directory)
            fb.store(key, storage_directory)
            self._pop_file_record(index, file_hashes, file_names, file_folders, file_sizes)
            stored = True
            break
        if not stored:
            index += 1

    #store all remaining files as initial versions
    self._store_full_blobs(key, storage_directory, working_directory,
                           file_hashes, file_names, file_folders)

def _load_file_listing(self, key, storage_directory, tree_hash):
    '''Load the tree blob for tree_hash and return its file records as four
    parallel lists: (hashes, names, folders, sizes).'''
    tb = tree_blob()
    tb.load(key, storage_directory, tree_hash)
    file_listing = tb.write_directory_structure(key, storage_directory, None, False)
    hashes = []
    names = []
    folders = []
    sizes = []
    for (path, file_name, hash_hex, file_size) in file_listing:
        hashes.append(hash_hex)
        names.append(file_name)
        folders.append(path)
        sizes.append(file_size)
    return hashes, names, folders, sizes

def _pop_file_record(self, index, file_hashes, file_names, file_folders, file_sizes):
    '''Remove the file record at index from all four parallel lists in place.'''
    file_hashes.pop(index)
    file_names.pop(index)
    file_folders.pop(index)
    file_sizes.pop(index)

def _store_full_blobs(self, key, storage_directory, working_directory, file_hashes, file_names, file_folders):
    '''Store every listed file as a full (non-delta) file blob.'''
    for index in range(len(file_hashes)):
        full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
        #BUGFIX: always read in binary mode; the original initial-commit
        #branch used text mode ('r'), which corrupts binary content.
        with open(full_file_path, 'rb') as new_file:
            fb = file_blob()
            fb.compute_delta(key, new_file.read())
            fb.store(key, storage_directory)
try: print s2.recv(1028) except socket.error as msg: pass """ print '\n\n' print '************************************************************************' print '***Testing sending a large file between two sibs' logging.debug('Testing sending a large file between two sibs') print '************************************************************************' large_file = open('../resource/alice.txt','rb') fb=file_blob() fb.compute_delta(key,large_file.read()) large_file_hash = fb.store(key, os.path.join(peer_B_storage, 'test_share')) time.sleep(1) sib_b = SIB() sib_b.new_sockets.put(11120) sib_b.js.storage_directory = peer_B_storage sib_b.js.my_machine_ID = 'machine_B' sib_a = SIB() sib_a.new_sockets.put(11121) sib_a.js.storage_directory = peer_A_storage sib_a.js.my_machine_ID = 'machine_A' #TODO: this transfer is slow?