Exemplo n.º 1
0
def create_gst_on_file(filename, gprint=False, strip=False):
    """
    Build a suffix tree containing every line of a text file.

    Each line of ``filename`` is added to one ``SuffixTree`` as a separate
    string. Progress and the total elapsed time are printed to stdout.

    Args:
        filename: Path of the text file to read, one string per line.
        gprint: When true, wrap the finished tree in a ``Grapher`` and call
            its ``createGraphviz()`` method.
        strip: When true, each line is passed through ``str.strip()`` before
            being added; otherwise the line is added exactly as read,
            including its line terminator.

    Returns:
        The populated ``SuffixTree``.
    """
    st = SuffixTree()

    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3, unlike the print statement.
    print("Opening file \"{0}\".".format(filename))
    time_start = default_timer()

    with open(filename) as text_file:
        # Count from 1 so the progress message states how many lines have
        # actually been added to the tree.
        for count, line in enumerate(text_file, 1):
            st.add_string(line.strip() if strip else line)

            if count % 100000 == 0:
                print("\tProcessed {0} elements".format(count))

    if gprint:
        g = Grapher(st)
        g.createGraphviz()

    print("Suffix tree for \"{0}\" complete in {1} seconds".format(
        filename, default_timer() - time_start))

    return st
Exemplo n.º 2
0
def length_distribution_on_suffix(filename, adaptersequence):
    """
    Count adapter matches and tally the leftover length of each sequence.

    A ``SuffixTree`` is built from the reversed adapter sequence. Every
    string yielded by ``generate_strings(filename)`` is reversed and matched
    against that tree with ``find_prefixmatch_nr``; the length of the
    returned match is subtracted from the length of the string.

    Args:
        filename: Passed unchanged to ``generate_strings``.
        adaptersequence: Adapter sequence to look for.

    Returns:
        A tuple ``(number_of_matches, length_distribution)``:
        ``number_of_matches`` is how many strings produced a non-empty match,
        and ``length_distribution`` is a dict mapping each leftover length
        (string length minus match length) to the number of strings that
        have it. Strings without a match are tallied under their full length.
    """
    # The adapter is stored reversed, so matching a reversed sequence from
    # its start corresponds to matching the end of the original sequence.
    prefix_tree = SuffixTree()
    prefix_tree.add_string(adaptersequence[::-1])

    matches = 0
    remainder_counts = {}

    for sequence in generate_strings(filename):
        # NOTE(review): the meaning of the trailing 0.0 argument is not
        # visible from this function - confirm against find_prefixmatch_nr.
        match = prefix_tree.find_prefixmatch_nr(
            sequence[::-1], prefix_tree.root, 0.0)
        matched_len = len(match)
        if matched_len:
            matches += 1

        remainder = len(sequence) - matched_len
        remainder_counts[remainder] = remainder_counts.get(remainder, 0) + 1

    return matches, remainder_counts