forked from airdrik/PyDataTable
/
datatable_diff.py
348 lines (329 loc) · 15.1 KB
/
datatable_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
'''
Module for comparing data in two DataTables.
The main entry point is the diff method which takes the two tables and the list of headers which specifies the "primary key" for the tables
diff returns a ResultSet instance which contains the difference data by primary key.
ResultSet contains several methods for pruning expected differences and for slicing up the results to help discover patterns in the changes:
ignoreField prunes all differences for the given field
checkRemove prunes differences for the given field which match the given predicate
checkRemove_multiField prunes differences which affect multiple fields (e.g. we expect the change to move some of the value from one field to another so long as the sum is the same)
changedFields returns the list of fields which reported differences
filter takes a Result predicate and returns a new ResultSet containing the matching Results
pick returns a (pseudo-)random Result from the ResultSet
original{From|To}Rows returns a DataTable containing the rows which had differences
ResultSet also contains a few methods for formatted output:
repr(rs) returns a summary of the differences (the number of results in the collection)
str(rs) returns a concise display of the results showing specifically what changed
rs.printFormatted() prints a fixed-width-formatted tabular display of the results
ResultSet contains a list of Result instances which represents the changes to a single key entry (which may be multiple from/to rows if the "primary key" used doesn't guarantee uniqueness)
fromRow and toRow represent the entire original row (or rows)
repr(result) returns a summary of the differences (the number of from and to rows)
str(result) returns the differences - if there is one fromRow and one toRow returns the collection of fields which changed, otherwise reports the number of from and to rows
ignoreField, checkRemove and checkRemove_multiField are also available, but should be ignored on the Result object as they are called from the ResultSet when its corresponding methods are called
'''
from collections import defaultdict
from datatable import DataTable
from datatable_util import AttributeDict, sortKey
from functools import total_ordering
@total_ordering
class Result:
    '''
    Result class representing the difference between rows for a given bucket
    Contains the key for this bucket (may be used to find the rows in the original files),
    those fields which changed with the from and to values, and the actual from and to rows

    key is the tuple of bucket values; each key field is also exposed as an attribute
    diffFields maps a field name to that field's position within the from/to rows
    fromRow and toRow are lists of row tuples (or None when the bucket is missing on that side)
    Per-field differences are only tracked when there is exactly one from row and one to row
    '''
    def __init__(self, key, keyFields, diffFields, fromRow, toRow):
        self.key = key
        self.diffFields = diffFields
        self.fromRow = fromRow
        self.toRow = toRow
        # expose every key field as an attribute (e.g. result.someKeyField)
        # NOTE: a key field named like one of the attributes above would overwrite it
        self.__dict__.update(dict(zip(keyFields, key)))
        # position within the row -> (from value, to value) for every field that differs
        self.__data = {}
        if fromRow and toRow and len(fromRow) == 1 and len(toRow) == 1:
            #extract the fields that are different between the two runs
            for i, (f, t) in enumerate(zip(fromRow[0], toRow[0])):
                if f != t:
                    self.__data[i] = f, t

    def __eq__(self, other):
        '''
        Result == Result compares the key and the remaining field differences
        Result == tuple compares the key only
        Result == dict checks that every entry of the dict matches the attribute of the same name
        Any other operand is reported as unsupported (NotImplemented) so that python
        falls back to its default handling instead of raising
        '''
        if isinstance(other, Result):
            return self.key == other.key and self.__data == other.__data
        if isinstance(other, tuple):
            return self.key == other
        if isinstance(other, dict):
            return not any(self.__dict__[k] != other[k] for k in other.keys())
        return NotImplemented

    def __lt__(self, other):
        '''
        Orders results by key, then by the changed values in field-position order
        A field which only changed on one side is compared as an empty tuple
        Returns NotImplemented for operands which are neither Result nor tuple
        '''
        if isinstance(other, Result):
            if self == other:
                return False
            def it():
                yield self.key, other.key
                # sorted so the comparison always walks the fields in the same order
                for k in sorted(set(self.__data.keys()).union(other.__data.keys())):
                    yield self.__data.get(k, ()), other.__data.get(k, ())
            for s, o in it():
                if s != o:
                    # sortKey comes from datatable_util; presumably it makes mixed values orderable
                    return tuple(sortKey(i) for i in s) < tuple(sortKey(i) for i in o)
            return False
        if isinstance(other, tuple):
            return self.key < other
        return NotImplemented

    def comparable(self):
        '''returns whether there are any per-field differences left on this result'''
        return bool(self.__data)

    def __bool__(self):
        '''a result is truthy while it still has field differences or its from/to row counts don't line up'''
        return bool(self.__data or self.fromRow is None or self.toRow is None or len(self.fromRow) != len(self.toRow))

    def __getitem__(self, field):
        '''returns the (from, to) pair for the field; KeyError if the field is unknown or has no difference'''
        return self.__data[self.diffFields[field]]

    def __contains__(self, field):
        '''returns whether the field still has a recorded difference; KeyError if the field is unknown'''
        return self.diffFields[field] in self.__data

    def __delitem__(self, field):
        '''drops the recorded difference for the field; KeyError if there is none'''
        del self.__data[self.diffFields[field]]

    def ignoreField(self, field):
        '''drops the recorded difference for the field, if any'''
        if field in self:
            del self[field]

    def checkRemove(self, field, filterMethod):
        '''
        remove the field from the result if filterMethod returns true for the fromRow, toRow pairs
        field is the field to check
        filterMethod is a method which takes two parameters (the fromRow and toRow versions of the field) and returns if they can be removed from the result
        '''
        fieldIdx = self.diffFields[field]
        if fieldIdx in self.__data:
            f, t = self.__data[fieldIdx]
            if filterMethod(f, t):
                del self.__data[fieldIdx]

    def checkRemove_multiField(self, filterMethod, *fields):
        '''
        remove the set of fields from the result if filterMethod returns true for those entries
        filterMethod is a method which takes two dicts: fromRow and toRow, with those fields specified by the fields parameter and returns if those values can be removed from the result
        fields is a list of fields to check and possibly remove
        Nothing happens unless every one of the fields has a recorded difference
        '''
        fieldIdxs = tuple((field, self.diffFields[field]) for field in fields)
        if any(fieldIdx not in self.__data for field, fieldIdx in fieldIdxs):
            return
        # i == 0 collects the from values, i == 1 the to values
        fromRow, toRow = (AttributeDict((field, self.__data[fieldIdx][i]) for field, fieldIdx in fieldIdxs) for i in (0, 1))
        if filterMethod(fromRow, toRow):
            for field, fieldIdx in fieldIdxs:
                del self.__data[fieldIdx]

    def customCheck(self, keyFields, filterMethod, *fieldsToRemove):
        '''
        remove the set of fields from result if filterMethod returns true for the original fromRow and toRow pair
        filterMethod is a method which takes two dicts: fromRow and toRow, with the data from the original from and to rows
        fieldsToRemove is the list of fields to remove when filterMethod returns true
        Only applies when there is exactly one from row and one to row
        '''
        if not (self.fromRow and self.toRow and len(self.fromRow) == 1 and len(self.toRow) == 1):
            return
        fromRow = self.originalFromRows(keyFields)[0]
        toRow = self.originalToRows(keyFields)[0]
        if filterMethod(fromRow, toRow):
            for fieldIdx in tuple(self.diffFields[field] for field in fieldsToRemove):
                if fieldIdx in self.__data:
                    del self.__data[fieldIdx]

    def _rowCounts(self):
        '''returns (number of from rows, number of to rows), counting a missing side as 0'''
        return (len(self.fromRow) if self.fromRow else 0, len(self.toRow) if self.toRow else 0)

    def __repr__(self):
        return 'Result(%s) # from rows: %d, to rows: %d' % ((repr(self.key),) + self._rowCounts())

    def __str__(self):
        if self.__data:
            return '%s\t\t%s' % (self.key, {field: self.__data[fieldIdx] for field, fieldIdx in self.diffFields.items() if fieldIdx in self.__data})
        return '%s\tFrom: %s\tTo: %s' % ((self.key,) + self._rowCounts())

    def dataKeys(self):
        '''returns the tuple of field names which still have a recorded difference'''
        return tuple(field for field, fieldIdx in self.diffFields.items() if fieldIdx in self.__data)

    def getLengths(self):
        '''returns the display width of each key value'''
        return [len('%s' % k) for k in self.key]

    def formatKeys(self, lengths):
        '''returns the key values right-aligned to the given widths, comma separated and terminated by " |"'''
        return ', '.join(('% ' + str(l) + 's') % k for l, k in zip(lengths, self.key)) + ' |'

    def _originalRows(self, rows, keyFields):
        '''rebuilds one AttributeDict per row from the diff fields plus the key fields; None is treated as no rows'''
        return [
            AttributeDict(
                {field: row[fieldIdx]
                    for field, fieldIdx in self.diffFields.items()
                }
            ) + dict(zip(keyFields, self.key)) for row in (rows or [])
        ]

    def originalFromRows(self, keyFields):
        '''returns the from rows of this bucket as a list of AttributeDicts including the key fields'''
        return self._originalRows(self.fromRow, keyFields)

    def originalToRows(self, keyFields):
        '''returns the to rows of this bucket as a list of AttributeDicts including the key fields'''
        return self._originalRows(self.toRow, keyFields)
class ResultSet:
    '''
    Collection of every Result produced by a diff, grouped by bucket key.
    Supports adding, filtering, iterating and deleting results, pruning
    expected differences from all results at once, and formatted output.
    '''
    def __init__(self, keyFields):
        self.keyFields = keyFields
        # bucket key -> list of Result instances sharing that key
        self.__data = defaultdict(list)

    def __iadd__(self, result):
        # anything that is not a Result, or a Result which is falsy, is silently skipped
        if not isinstance(result, Result) or not result:
            return self
        self.__data[result.key].append(result)
        return self

    def filter(self, criteria):
        '''returns a new ResultSet holding the results for which criteria(result) is true'''
        matching = ResultSet(self.keyFields)
        for candidate in self:
            if not criteria(candidate):
                continue
            matching += candidate
        return matching

    def __len__(self):
        # counts distinct keys, not individual results
        return len(self.__data)

    def __iter__(self):
        for sameKeyResults in self.__data.values():
            for result in sameKeyResults:
                yield result

    def __getitem__(self, key):
        # explicit membership test so a lookup never creates an empty entry in the defaultdict
        if key not in self.__data:
            raise KeyError(key)
        return self.__data[key]

    def __delitem__(self, key):
        '''deletes a single Result (dropping its key once no results remain) or a whole key'''
        if not isinstance(key, Result):
            del self.__data[key]
            return
        sameKeyResults = self.__data[key.key]
        sameKeyResults.remove(key)
        if not sameKeyResults:
            del self.__data[key.key]

    def __repr__(self):
        return 'ResultSet() # length: %d' % len(self)

    def __str__(self):
        lines = ['Results:']
        lines.extend(str(result) for result in self)
        return '\n'.join(lines)

    def printFormatted(self):
        '''prints the formatted lines produced by _formatResults for this set'''
        for formattedLine in _formatResults(self):
            print(formattedLine)

    def maxKeyLengths(self):
        '''returns, per key field, the widest display width among the field name and all result keys'''
        rows = [self.keyFields]
        rows.extend(result.key for result in self)
        widths = []
        for column in range(len(self.keyFields)):
            widths.append(max(len('%s' % row[column]) for row in rows))
        return widths

    def formatKeyFields(self, lengths):
        '''returns the key field names right-aligned to the given widths, comma separated and terminated by " |"'''
        cells = [('% ' + str(width) + 's') % name for width, name in zip(lengths, self.keyFields)]
        return ', '.join(cells) + ' |'

    def pick(self):
        '''Returns a (somewhat) random result object'''
        firstBucket = next(iter(self.__data.values()))
        return firstBucket[0]

    def _pruneAfter(self, action):
        '''applies action to a snapshot of the results and deletes each result which is falsy afterwards'''
        for result in list(self):
            action(result)
            if not result:
                del self[result]

    def ignoreField(self, field):
        '''drops all differences for the given field; removes any result which is falsy afterwards'''
        self._pruneAfter(lambda result: result.ignoreField(field))

    def changedFields(self):
        '''return the list of fields which changed'''
        changed = set()
        for result in self:
            changed.update(result.dataKeys())
        return sorted(changed)

    def checkRemove(self, field, filterMethod):
        '''
        remove the field from each result if filterMethod returns true for the fromRow, toRow pairs. Removes any result which has no more inline differences
        field is the field to check
        filterMethod is a method which takes two parameters (the fromRow and toRow versions of the field) and returns if they can be removed from the result
        '''
        self._pruneAfter(lambda result: result.checkRemove(field, filterMethod))

    def checkRemove_multiField(self, filterMethod, *fields):
        '''
        remove the set of fields from each result if filterMethod returns true for those entries. Removes any result which has no more inline differences
        filterMethod is a method which takes two dicts: fromRow and toRow, with those fields specified by the fields parameter and returns if those values can be removed from the result
        fields is a list of fields to check and possibly remove
        '''
        self._pruneAfter(lambda result: result.checkRemove_multiField(filterMethod, *fields))

    def customCheck(self, filterMethod, *fieldsToRemove):
        '''
        remove the set of fields from each result if filterMethod returns true for the original fromRow and toRow pair. Removes any result with no more inline differences
        filterMethod is a method which takes two dicts: fromRow and toRow, with the data from the original from and to rows
        fieldsToRemove is the list of fields to remove when filterMethod returns true
        '''
        self._pruneAfter(lambda result: result.customCheck(self.keyFields, filterMethod, *fieldsToRemove))

    def originalFromRows(self):
        '''return the original rows being diffed from'''
        def rows():
            for result in self:
                yield from result.originalFromRows(self.keyFields)
        return DataTable(rows())

    def originalToRows(self):
        '''return the original rows being diffed to'''
        def rows():
            for result in self:
                yield from result.originalToRows(self.keyFields)
        return DataTable(rows())
def _bucket(table, bucketHeaders, diffHeaders):
buckets = defaultdict(lambda : [])
for row in table:
key = tuple(row[h] for h in bucketHeaders)
value = tuple((row[h] if h in row else None) for h in diffHeaders)
buckets[key].append(value)
return buckets
def sortRowKey(row):
    '''Returns the tuple obtained by applying sortKey (from datatable_util) to every value of row, for use as a sort key.'''
    keyed = [sortKey(value) for value in row]
    return tuple(keyed)
def diff(fromTable, toTable, *buckets):
    '''
    The base diff method - buckets the data and ships it off to the Result and ResultSet classes to check for in-line differences
    fromTable and toTable are the tables to compare; both must provide headers() and iterate over their rows
    buckets are the headers which make up the "primary key" used to match rows between the two tables
    Returns a ResultSet holding one Result per matched row pair, or one per key when the row counts differ

    The compared columns are ordered by header order (common headers as they appear in fromTable,
    then from-only headers, then to-only headers) and keys are visited in first-seen order
    (fromTable first) so that row pairing and result order do not depend on set iteration order
    '''
    fromHeaders = list(fromTable.headers())
    toHeaders = list(toTable.headers())
    #split the data into buckets
    fromBucketHeaders = [b for b in buckets if b in fromHeaders]
    toBucketHeaders = [b for b in buckets if b in toHeaders]

    commonSet = set(fromHeaders).intersection(toHeaders).difference(buckets)
    commonOtherHeaders = []
    seen = set()
    for h in fromHeaders:
        if h in commonSet and h not in seen:
            seen.add(h)
            commonOtherHeaders.append(h)
    fromOtherHeaders = [h for h in fromHeaders if h not in fromBucketHeaders and h not in commonSet]
    toOtherHeaders = [h for h in toHeaders if h not in toBucketHeaders and h not in commonSet]

    # position of every compared field within the value tuples built by _bucket
    diffHeadersList = commonOtherHeaders + fromOtherHeaders + toOtherHeaders
    diffHeaders = {h: i for i, h in enumerate(diffHeadersList)}

    fromBuckets = _bucket(fromTable, fromBucketHeaders, diffHeadersList)
    toBuckets = _bucket(toTable, toBucketHeaders, diffHeadersList)

    allKeys = list(fromBuckets) + [key for key in toBuckets if key not in fromBuckets]
    results = ResultSet(buckets)
    for key in allKeys:
        # membership tests rather than indexing so the defaultdicts don't grow empty entries
        fromBucket = sorted(fromBuckets[key], key=sortRowKey) if key in fromBuckets else None
        toBucket = sorted(toBuckets[key], key=sortRowKey) if key in toBuckets else None
        if fromBucket and toBucket and len(fromBucket) == len(toBucket):
            # same number of rows on both sides: pair them up in sorted order
            for fromRow, toRow in zip(fromBucket, toBucket):
                results += Result(key, buckets, diffHeaders, [fromRow], [toRow])
        else:
            results += Result(key, buckets, diffHeaders, fromBucket, toBucket)
    return results
def _formatResults(results):
'''Produce a pretty string for printing to the screen
format:
header line: "bucket", Field, , Field, ...
data lines: bucket, field_from, field_to, field_from, field_to...
'''
if not results:
yield 'No results to compare'
return
mismatch = sorted(result for result in results if result.fromRow is None or result.toRow is None or len(result.fromRow) != len(result.toRow))
keyMaxLengths = results.maxKeyLengths()
keyTotalSize = len(results.formatKeyFields(keyMaxLengths))
if mismatch:
yield "Buckets don't match number of rows:"
yield results.formatKeyFields(keyMaxLengths) + ' From Rows To Rows'
for result in mismatch:
yield result.formatKeys(keyMaxLengths) + ' %-12d %-12d' % (len(result.fromRow) if result.fromRow else 0, len(result.toRow) if result.toRow else 0)
results = results.filter(lambda result: result.fromRow and result.toRow and len(result.fromRow) == len(result.toRow))
if not results:
yield 'No inline differences'
return
yield 'Changes in common buckets:'
headers = results.changedFields()
resultList = []
maxLens = [keyTotalSize] + [0]*(len(headers)*2)
for i in range(len(headers)):
maxLens[i*2+1] = len(str(headers[i]))
for result in results:
buckets = (result.formatKeys(keyMaxLengths),)
for i, h in enumerate(headers):
if h in result:
maxLens[i*2+1] = max(maxLens[i*2+1], len(str(result[h][0])))
maxLens[i*2+2] = max(maxLens[i*2+2], len(str(result[h][1])))
buckets += result[h]
else:
buckets += '', ''
resultList.append(buckets)
maxLens = [str(m+1) for m in maxLens]
linePattern = '%-' + 's%-'.join(maxLens) + 's'
yield linePattern % ((results.formatKeyFields(keyMaxLengths),) + sum(((h,'') for h in headers), ()))
for result in resultList:
yield linePattern % result
def formatResults(results):
    '''Returns the report lines produced by _formatResults joined into one newline-separated string.'''
    reportLines = _formatResults(results)
    return '\n'.join(reportLines)
def expectedChange(beforeValue, afterValue):
    '''Returns a two-argument predicate which is true only for a change from beforeValue to afterValue.'''
    expected = (beforeValue, afterValue)
    def matches(f, t):
        return (f, t) == expected
    return matches
def fromNothingToNothing(f, t):
    '''Returns True when both the from value and the to value are falsy (e.g. None, empty string, 0).'''
    return not f and not t