def longest_strand(*args):
    """
    Find and print the longest strand that is identical between two or more binary files.

    Each file is read from the ``test_samples`` directory under the current working directory, converted to a
    hex string, and passed to ``suffix_tree.SuffixTree``. The result of the tree's ``longest_strand()`` method
    is then printed.

    The files are compared as hex strings (two characters per byte), so the printed strand length and offsets
    are measured in hex characters, not in bytes.

        :param args: names of the binary files, relative to ``<cwd>/test_samples/``
        :return: None. Prints the following:
            - the longest strand (in hex) that is identical between two or more files
            - the length of the strand
            - the file names where the longest strand appears
            - the offset where the strand appears in each of those files

        If fewer than two file names are given, an error message is printed and nothing else happens.

    """
    if len(args) < 2:
        print("Error! Need at least 2 binary files.")
        return

    # The sample files live in a sub-directory of the current working directory
    samples_dir = os.path.join(os.getcwd(), 'test_samples')

    # Map each file name to the hex representation of its contents (insertion order is the argument order)
    file_dict = {}
    for file in args:
        with open(os.path.join(samples_dir, file), 'rb') as f:
            file_dict[file] = f.read().hex()

    # Building the Suffix Tree
    file_names = list(file_dict)
    s_tree = suffix_tree.SuffixTree([file_dict[name] for name in file_names])
    strand, indexes, offset_list = s_tree.longest_strand()

    print("The longest strand is:", strand)
    print("The length of the strand is:", len(strand))

    # `indexes` holds positions into the list handed to the tree; translate them back to file names
    matching_files = {file_names[file_number] for file_number in indexes}

    # Each offset is reduced by the combined length of the preceding files to make it file-relative.
    # NOTE(review): the "+ 1" presumably accounts for a one-character separator the tree places between
    # inputs, and offset_list is assumed to be ordered like the files -- confirm against suffix_tree.
    remaining_offsets = iter(offset_list)
    preceding_length = 0

    for file in file_names:
        if file in matching_files:
            print("The largest strand appears in:", file, "and the offset is",
                  next(remaining_offsets) - preceding_length)
        preceding_length += len(file_dict[file]) + 1
# Example #2
# 0
def construct_tree(string):
    """
    Build a suffix tree for ``string`` with a "$" appended, then refresh its counters.

        :param string: the text to index; "$" is appended before it is handed to ``suffix_tree.SuffixTree``
        :return: the constructed tree, after ``update_counter()`` has been called on it
    """
    terminated = string + "$"
    built = suffix_tree.SuffixTree(terminated)
    built.update_counter()
    return built
# Example #3
# 0
if __name__ == '__main__':
    # Test case to see if the Suffix Tree implementation works correctly.
    #
    # Given the input list ["33", "AA2222", "BBB2222", "CCC33333"],
    #
    # it should output (per the original author's note):
    #
    #     The longest strand of bytes that is identical between two or more files: 2222
    #     The length of the strand is: 4
    #     The largest strand appears in file 2 and the offset is 2
    #     The largest strand appears in file 3 and the offset is 3

    test = ["33", "AA2222", "BBB2222", "CCC33333"]

    # Keep the instance under its own name: rebinding `suffix_tree` here would replace the name
    # that `suffix_tree.SuffixTree` is looked up on for the rest of the run.
    tree = suffix_tree.SuffixTree(test)
    strand, indexes, offset_list = tree.longest_strand()
    print("The longest strand of bytes that is identical between two or more files:", strand)
    print("The length of the strand is:", len(strand))

    # Each offset is reduced by the combined length of the preceding inputs to make it input-relative.
    # NOTE(review): the "+ 1" presumably accounts for a one-character separator between inputs -- confirm.
    preceding_length = 0
    iter_offset_list = iter(offset_list)

    for counter, file in enumerate(test):
        if counter in indexes:
            # Files are reported 1-based
            print("The largest strand appears in file", counter + 1, "and the offset is",
                  next(iter_offset_list) - preceding_length)
        preceding_length += len(file) + 1
# Example #4
# 0
import suffix_tree

# Build a suffix tree from the first line of the dataset file and print the keys of every
# dict entry found in the tree's node collection.

# Alternative inputs used during development:
#text = 'ATAAATG$'

#with open('suffixTree.txt', 'r') as f:
with open('dataset_96_5.txt', 'r') as f:
    text = f.readline().strip()

tree = suffix_tree.SuffixTree()

# Insert every suffix of the text together with its starting position.
# `range` (rather than `xrange`) keeps this runnable on both Python 2 and Python 3.
for i in range(len(text)):
    tree.insert(text[i:], i)

#tree.print_tree()

#print tree._nodes[0].keys()

# NOTE(review): this reads the tree's private `_nodes` attribute; only entries that are dicts are printed.
for item in tree._nodes:
    if isinstance(item, dict):
        # Single-argument parenthesised print behaves the same as a statement (Py2) and a function (Py3)
        print('\n'.join(item.keys()))