def longest_strand(*args):
    """Print the longest strand shared by two or more of the given binary files.

    Each file is read from the ``test_samples`` directory under the current
    working directory, converted to a hex string, and handed to
    ``suffix_tree.SuffixTree`` together with the other files. The result of
    ``SuffixTree.longest_strand()`` is then reported on stdout.

    NOTE(review): the files are compared as hex strings (two characters per
    byte), so the printed strand, its length and the offsets are presumably
    expressed in hex digits rather than bytes -- confirm against suffix_tree.

    :param args: names of the binary files, relative to ``./test_samples``
        (``os.path.join`` is used, so an absolute path is opened as given).
        The contents are keyed by name, so a name passed twice collapses
        into a single entry.
    :return: None. Prints the following:
        - the longest strand that is identical between two or more files
        - the length of the strand
        - the file names where the strand appears
        - the offset where the strand appears in each of those files
    """
    if len(args) < 2:
        # The guard rejects 0 or 1 files, i.e. at least two are required.
        print("Error! Need at least 2 binary files.")
        return

    # The test files live in a sub-directory of the working directory.
    sample_dir = os.path.join(os.getcwd(), 'test_samples')

    # Map each file name to the hex representation of its contents.
    file_dict = {}
    for file_name in args:
        with open(os.path.join(sample_dir, file_name), 'rb') as f:
            file_dict[file_name] = f.read().hex()

    # Building the Suffix Tree
    file_names = list(file_dict)
    s_tree = suffix_tree.SuffixTree(list(file_dict.values()))
    strand, indexes, offset_list = s_tree.longest_strand()
    print("The longest strand is:", strand)
    print("The length of the strand is:", len(strand))

    # Names of the files that `indexes` points at.
    matched = {file_names[file_number] for file_number in indexes}

    # Offsets are consumed in file order, one per matching file.
    # NOTE(review): this assumes offset_list is ordered like the input files
    # and is relative to the concatenation of all inputs -- confirm.
    offsets = iter(offset_list)
    consumed = 0  # characters taken up by all preceding files
    for file_name in file_names:
        if file_name in matched:
            print("The largest strand appears in:", file_name,
                  "and the offset is", next(offsets) - consumed)
        # The +1 presumably accounts for a one-character separator that the
        # tree places after each input.
        consumed += len(file_dict[file_name]) + 1
def construct_tree(string):
    """Build a suffix tree for ``string`` and refresh its counters.

    A ``"$"`` is appended to the input before it is passed to
    ``suffix_tree.SuffixTree`` (presumably as an end-of-text terminator --
    confirm against suffix_tree), and ``update_counter()`` is called on the
    new tree before it is handed back.

    :param string: the text to index; must support ``+ "$"``
    :return: the constructed ``SuffixTree`` instance
    """
    terminated = string + "$"
    built = suffix_tree.SuffixTree(terminated)
    built.update_counter()
    return built
if __name__ == '__main__':
    # Test case to see if the Suffix Tree implementation works correctly.
    #
    # Given the input list ["33", "AA2222", "BBB2222", "CCC33333"], the
    # expected output is:
    #   The longest strand of bytes that is identical between two or more files: 2222
    #   The length of the strand is: 4
    #   The largest strand appears in file 2 and the offset is 2
    #   The largest strand appears in file 3 and the offset is 3
    test = ["33", "AA2222", "BBB2222", "CCC33333"]

    # Use dedicated names for the results: rebinding `suffix_tree` or
    # `longest_strand` here would shadow the module and the function of the
    # same names for the rest of the run.
    tree = suffix_tree.SuffixTree(test)
    strand, indexes, offset_list = tree.longest_strand()
    print("The longest strand of bytes that is identical between two or more files:", strand)
    print("The length of the strand is:", len(strand))

    # Offsets are consumed in input order, one per matching entry.
    offsets = iter(offset_list)
    consumed = 0  # characters taken up by all preceding entries
    for position, entry in enumerate(test):
        if position in indexes:
            # Files are reported 1-based.
            print("The largest strand appears in file", position + 1,
                  "and the offset is", next(offsets) - consumed)
        # The +1 presumably accounts for a one-character separator that the
        # tree places after each input.
        consumed += len(entry) + 1
import suffix_tree

# Build a suffix tree from the first line of a dataset file and print the
# keys stored in its nodes.
#
# Written for Python 3 (range / print()), matching the rest of this code.
# For a quick manual check, `text` can be set to a short literal such as
# 'ATAAATG$', or the input can be read from 'suffixTree.txt' instead.
with open('dataset_96_5.txt', 'r') as f:
    text = f.readline().strip()

# Insert every suffix of the text together with its starting position.
tree = suffix_tree.SuffixTree()
for start in range(len(text)):
    tree.insert(text[start:], start)

# Only the dict entries of the tree's node list are printed, one key per
# line. NOTE(review): this reaches into the private `_nodes` attribute;
# `tree.print_tree()` looks like the public alternative -- confirm.
for item in tree._nodes:
    if isinstance(item, dict):
        print('\n'.join(item.keys()))