forked from halfak/deltas
/
demonstrate_speed.py
130 lines (113 loc) · 4.65 KB
/
demonstrate_speed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import cProfile as profile
import random
import time
import pickle
from deltas import segment_matcher, sequence_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split, text_split
from mw import api
# --- Fixture setup -----------------------------------------------------------
# Fetches two real enwiki revisions over the network, tokenizes them, and
# synthesizes two "random" texts with the same token structure but random
# dictionary words substituted for every word token.
segmenter = ParagraphsSentencesAndWhitespace()
session = api.Session("https://en.wikipedia.org/w/api.php")
# Two consecutive-ish revisions of the same article; ['*'] is the wikitext body.
common1 = session.revisions.get(638029546, properties={"content"})['*']
common2 = session.revisions.get(638077284, properties={"content"})['*']
common1_tokens = list(wikitext_split.tokenize(common1))
common2_tokens = list(wikitext_split.tokenize(common2))
# Unix word list; assumes /usr/share/dict/words exists on this machine.
words = [l.strip() for l in open('/usr/share/dict/words')]
# Replace each word token with a random dictionary word, keep all other
# tokens (punctuation, whitespace, markup) verbatim.
random1 = ''.join(random.choice(words) if t.type == "word" else str(t)
for t in common1_tokens)
# NOTE(review): random2 is also derived from common1_tokens, not
# common2_tokens — possibly intentional (same token skeleton, fully random
# words = worst case for the matchers), but verify against the original intent.
random2 = ''.join(random.choice(words) if t.type == "word" else str(t)
for t in common1_tokens)
random2_tokens = list(wikitext_split.tokenize(random2))
random1_tokens = list(wikitext_split.tokenize(random1))
print("Tokenizing:")


def tokenize_common():
    """Benchmark text_split vs. wikitext_split tokenization of common1.

    Prints the mean wall-clock time per tokenization over 50 runs each.
    """
    reps = 50
    began = time.time()
    for _ in range(reps):
        tokens = list(text_split.tokenize(common1))
    print("\ttext_split: {0}".format((time.time() - began) / reps))
    began = time.time()
    for _ in range(reps):
        tokens = list(wikitext_split.tokenize(common1))
    print("\twikitext_split: {0}".format((time.time() - began) / reps))


tokenize_common()
#profile.run('segment_common()', sort="cumulative")
print("Pickling segments:")


def segments_pickle():
    """Benchmark pickling and unpickling of a segmentation of common1.

    Prints the mean wall-clock time per operation over 25 runs each.

    Fix: the timer is now restarted before the unpickling loop.  The
    original code reused the `start` from the pickling measurement, so
    the reported "unpickling" time also included the entire pickling
    loop.
    """
    segments = segmenter.segment(common1_tokens)
    # Warm-up dump so the timed loop measures steady-state pickling only.
    pickled_segments = pickle.dumps(segments)
    start = time.time()
    for _ in range(25):
        pickled_segments = pickle.dumps(segments)
    print("\tpickling: {0}".format((time.time() - start)/25))
    start = time.time()  # FIX: reset the timer for the unpickling phase
    for _ in range(25):
        unpickled_segments = pickle.loads(pickled_segments)
    print("\tunpickling: {0}".format((time.time() - start)/25))


segments_pickle()
#profile.run('segment_common()', sort="cumulative")
print("Running sequence matcher (LCS):")


def sequence_common():
    """Benchmark sequence_matcher.diff on the two real revisions (25 runs)."""
    trials = 25
    began = time.time()
    for _ in range(trials):
        operations = list(sequence_matcher.diff(common1_tokens, common2_tokens))
    print("\tcommon: {0}".format((time.time() - began) / trials))


sequence_common()
#profile.run('sequence_common()', sort="cumulative")
def sequence_random():
    """Benchmark sequence_matcher.diff on the two random texts (25 runs)."""
    trials = 25
    began = time.time()
    for _ in range(trials):
        operations = list(sequence_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.time() - began) / trials))


#sequence_random()
#profile.run('sequence_random()', sort="cumulative")
print("Segmenting:")


def segment_common():
    """Benchmark the segmenter on common1's tokens (25 runs)."""
    trials = 25
    began = time.time()
    for _ in range(trials):
        segments = list(segmenter.segment(common1_tokens))
    print("\tcommon: {0}".format((time.time() - began) / trials))


segment_common()
#profile.run('segment_common()', sort="cumulative")
print("Running segment matcher:")


def segment_matcher_common():
    """Benchmark segment_matcher.diff on the two real revisions (25 runs).

    Renamed from ``segment_common``: the file already defines a different
    ``segment_common`` above (the segmenting benchmark), and redefining
    the same name here silently shadowed it.
    """
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff(common1_tokens, common2_tokens))
    print("\tcommon: {0}".format((time.time() - start)/25))


segment_matcher_common()
#profile.run('segment_matcher_common()', sort="cumulative")
def segment_common_fast():
    """Benchmark the SegmentMatcher processor API on the real revisions.

    Each of the 25 iterations processes both revisions, so the printed
    mean divides by 50 (one figure per processed text).
    """
    began = time.time()
    # NOTE(review): the timer starts before the matcher/processor are
    # constructed, so setup cost is folded into the reported mean.
    matcher = segment_matcher.SegmentMatcher()
    proc = matcher.processor()
    for _ in range(25):
        operations = list(proc.process(common1))
        operations = list(proc.process(common2))
    print("\tcommon_fast: {0}".format((time.time() - began) / 50))


segment_common_fast()
#profile.run('segment_common()', sort="cumulative")
def segment_random():
    """Benchmark segment_matcher.diff on the two random texts (25 runs)."""
    trials = 25
    began = time.time()
    for _ in range(trials):
        operations = list(segment_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.time() - began) / trials))


#segment_random()
#profile.run('segment_random()', sort="cumulative")
# Pre-segment every token stream once so the next benchmarks measure only
# the matching step, not segmentation.
common1_segments = segmenter.segment(common1_tokens)
common2_segments = segmenter.segment(common2_tokens)
random1_segments = segmenter.segment(random1_tokens)
random2_segments = segmenter.segment(random2_tokens)

print("Running segment matcher (post segmentation):")


def segment_common_seg():
    """Benchmark diff_segments on pre-segmented real revisions (25 runs)."""
    trials = 25
    began = time.time()
    for _ in range(trials):
        operations = list(segment_matcher.diff_segments(common1_segments, common2_segments))
    print("\tcommon: {0}".format((time.time() - began) / trials))


segment_common_seg()
#profile.run('segment_common_seg()', sort="cumulative")
def segment_random_seg():
    """Benchmark diff_segments on pre-segmented random texts (25 runs)."""
    trials = 25
    began = time.time()
    for _ in range(trials):
        operations = list(segment_matcher.diff_segments(random1_segments, random2_segments))
    print("\trandom: {0}".format((time.time() - began) / trials))


#segment_random_seg()
#profile.run('segment_random()', sort="cumulative")