This repository has been archived by the owner on Apr 21, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Utility.py
135 lines (102 loc) · 3.92 KB
/
Utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from algorithms.suffixtree.SuffixTree import SuffixTree
from algorithms.suffixtree.SuffixGrapher import Grapher
from contextlib import contextmanager
from timeit import default_timer
from timeit import timeit
from Queue import Queue
@contextmanager
def elapsed_timer():
    """
    Context manager that yields a zero-argument callable reporting elapsed seconds.

    While the ``with`` body is running, the callable returns the time elapsed
    since the block was entered. Once the block has been left it keeps
    returning the block's total duration. The reading is frozen on exit even
    when the body raises an exception.

    Timer from http://stackoverflow.com/questions/7370801/measure-time-elapsed-in-python
    """
    start = default_timer()
    elapser = lambda: default_timer() - start
    try:
        # The yielded lambda looks ``elapser`` up at call time, so rebinding
        # it in the ``finally`` clause changes what the caller sees afterwards.
        yield lambda: elapser()
    finally:
        # Runs on normal exit and on exceptions, so the timer always stops.
        end = default_timer()
        elapser = lambda: end - start
def create_gst_on_file(filename, gprint=False, strip=False):
    """
    Build a suffix tree containing every line of a text file.

    Each line of ``filename`` is added to a fresh SuffixTree as one string.
    Progress and the total build time are printed to stdout.

    filename -- path of the text file to read
    gprint   -- when true, pass the finished tree to Grapher and call
                createGraphviz() on it
    strip    -- when true, strip surrounding whitespace from each line before
                adding it; otherwise the line is added as read

    Returns the populated SuffixTree.
    """
    tree = SuffixTree()
    print("Opening file \"{0}\".".format(filename))
    started = default_timer()
    with open(filename) as handle:
        for index, entry in enumerate(handle):
            if strip:
                entry = entry.strip()
            tree.add_string(entry)
            # Report using the zero-based index of the line just added.
            if index % 100000 == 0:
                print("\tProcessed {0} elements".format(index))
    if gprint:
        Grapher(tree).createGraphviz()
    print("Suffix tree for \"{0}\" complete in {1} seconds".format(filename, default_timer() - started))
    return tree
def generate_strings(filename):
    """
    Generator yielding one string per line of ``filename``.

    Leading and trailing whitespace (including the line terminator) is
    stripped from every line; blank lines are yielded as empty strings.
    The file is read lazily and closed once the generator is exhausted.
    """
    with open(filename) as text_file:
        for line in text_file:
            yield line.strip()
def count_and_show_suffixes(suffix_tree, min_length=20, min_count=1000):
    """
    Print the long, frequent leaf labels of a suffix tree, least frequent first.

    The tree is walked breadth-first from ``suffix_tree.root``. A node's label
    is the concatenation of the edge substrings (as returned by
    ``suffix_tree.get_internal_subtring``) on the path leading to it. A leaf
    (a node without outgoing edges) is kept when its label has at least
    ``min_length`` characters, its ``suffixes`` value is at least
    ``min_count`` and the label is not the bare terminator '$'.

    suffix_tree -- tree object exposing ``root`` and ``get_internal_subtring``
    min_length  -- minimum label length to report (default 20)
    min_count   -- minimum ``suffixes`` value to report (default 1000)

    Output, one line per kept label: "<count>:\\t<label length>\\t<label>",
    with newlines removed from the label. Nothing is returned.
    """
    # Local import, matching the function-scope imports used elsewhere in this
    # module; a plain deque is enough for a single-threaded worklist.
    from collections import deque

    pending = deque([(suffix_tree.root, "")])
    counts = {}
    while pending:
        node, label = pending.popleft()
        if not node.edges:
            # NOTE(review): assumes only leaves are recorded -- confirm
            # against the intended nesting of this branch.
            if len(label) >= min_length and node.suffixes >= min_count and label != '$':
                counts[label] = node.suffixes
            continue
        for child in node.edges.values():
            child_label = str(label) + suffix_tree.get_internal_subtring(child, child.start, child.end)
            pending.append((child, child_label))

    # Sort ascending by count so the most frequent labels are printed last.
    for label, count in sorted(counts.items(), key=lambda item: int(item[1])):
        shown = label.replace('\n', '')
        print("{0}:\t{2}\t{1}".format(count, shown, len(shown)))
def plot_graph_from_length_distribution(length_distribution, name=False):
    """
    Show a bar chart of a length distribution and optionally save it.

    length_distribution -- mapping of length -> count
    name                -- when truthy, the file name the figure is also
                           saved to via ``plt.savefig``

    Bars are drawn in ascending order of length, one tick per length.
    """
    import matplotlib.pyplot as plt

    # Sort so the x axis reads in increasing length instead of dict order.
    lengths = sorted(length_distribution)
    counts = [length_distribution[length] for length in lengths]
    positions = list(range(len(lengths)))

    plt.bar(positions, counts, align='center')
    plt.xticks(positions, lengths)
    plt.xlabel("Length")
    plt.ylabel("Count")
    # Save before show(): with blocking backends the figure is discarded once
    # the window is closed, so saving afterwards writes an empty image.
    if name:
        plt.savefig(name)
    plt.show()
def csv_distribution(distribution, name="distribution.csv"):
    """
    Write a length distribution to a CSV file.

    distribution -- mapping of length -> count
    name         -- path of the CSV file to create (overwritten if present)

    The file starts with the header row "length","count", followed by one
    row per entry in ascending order of length. Every field is quoted.
    """
    import csv
    import sys

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        d_file = open(name, 'wb')
    else:
        d_file = open(name, 'w', newline='')
    with d_file:
        wr = csv.writer(d_file, delimiter=',', quoting=csv.QUOTE_ALL)
        wr.writerow(['length', 'count'])
        # Sorted so the output does not depend on dict iteration order.
        wr.writerows(sorted(distribution.items()))
def length_distribution_on_suffix(filename, adaptersequence):
    """
    Match every sequence in a file against an adapter and tally what is left.

    A SuffixTree is built over the reversed adapter sequence (used as a
    prefix tree). Each line of ``filename`` is reversed and passed to
    ``find_prefixmatch_nr`` to obtain the longest suffix-prefix match.

    filename        -- file with one sequence per line
    adaptersequence -- adapter string to look for

    Returns a tuple ``(number_of_matches, length_distribution)``: the number
    of lines with a non-empty match, and a dict mapping the line length minus
    the match length to how many lines had that remaining length.
    """
    prefix_tree = SuffixTree()
    # Reverse the adapter so the suffix tree acts as a prefix tree.
    prefix_tree.add_string(adaptersequence[::-1])

    matched = 0
    histogram = {}
    for sequence in generate_strings(filename):
        # NOTE(review): the meaning of the 0.0 argument is defined by
        # SuffixTree.find_prefixmatch_nr -- confirm there.
        match = prefix_tree.find_prefixmatch_nr(sequence[::-1], prefix_tree.root, 0.0)
        match_length = len(match)
        if match_length > 0:
            matched += 1
        # NOTE(review): assumes every line is tallied, not only matching
        # ones -- confirm against the intended nesting.
        remaining = len(sequence) - match_length
        histogram[remaining] = histogram.get(remaining, 0) + 1
    return matched, histogram