def tree_diff_children(list1, list2, hash_func, algorithm):
    """Merge two sibling lists of etree Elements into one list of children.

    The two lists are aligned on the run reported by
    ``longest_common_substring()`` over their ``hash_func`` values.  Elements
    inside that run are diffed pairwise with ``tree_diff()``; stretches that
    cannot be aligned are replaced by a ``MULTITAG_HOLE`` element.

    Args:
        list1: first list of etree Elements.
        list2: second list of etree Elements.
        hash_func: callable applied to every element to get its comparison
            key.  When it is ``element_hash_strict`` and no run is found,
            the comparison is retried with ``element_hash_loose``.
        algorithm: passed through unchanged to ``tree_diff()`` and to the
            recursive calls.

    Returns:
        A list holding the ``tree_diff()`` results and ``MULTITAG_HOLE``
        elements, in left-to-right order.
    """
    if list1 == list2 == []:
        return []

    hashes1 = [hash_func(node) for node in list1]
    hashes2 = [hash_func(node) for node in list2]
    run_len, start1, start2 = longest_common_substring(hashes1, hashes2)

    merged = []

    if run_len == 0:
        if hash_func == element_hash_strict:
            # Nothing matched strictly: retry the same lists, loosely.
            merged.extend(
                tree_diff_children(list1, list2, element_hash_loose, algorithm)
            )
        else:
            # Nothing matched at all: the whole span becomes one hole.
            merged.append(etree.Element('MULTITAG_HOLE'))

    # Whatever sits to the left of the matched run.
    left_in_1 = start1 > 0
    left_in_2 = start2 > 0
    if left_in_1 and left_in_2:
        # Both lists have a prefix; align those prefixes (strict first).
        merged.extend(
            tree_diff_children(
                list1[:start1], list2[:start2], element_hash_strict, algorithm
            )
        )
    elif left_in_1 or left_in_2:
        # Only one list has a prefix, so there is nothing to align it with.
        merged.append(etree.Element('MULTITAG_HOLE'))

    if run_len > 0:
        end1 = start1 + run_len
        end2 = start2 + run_len

        # The matched run itself: diff the elements pair by pair.
        for node1, node2 in zip(list1[start1:end1], list2[start2:end2]):
            merged.append(tree_diff(node1, node2, algorithm))

        # Whatever sits to the right of the matched run.
        right_in_1 = end1 < len(list1)
        right_in_2 = end2 < len(list2)
        if right_in_1 and right_in_2:
            merged.extend(
                tree_diff_children(
                    list1[end1:], list2[end2:], element_hash_strict, algorithm
                )
            )
        elif right_in_1 or right_in_2:
            merged.append(etree.Element('MULTITAG_HOLE'))

    return merged
def tree_extract_children(list1, list2, hash_func, algorithm):
    """Extract hole values by matching brain children against sample children.

    ``list1`` (the "brain" side, per the error messages) and ``list2`` (the
    "sample" side) are aligned on the run reported by
    ``longest_common_substring()`` over their ``hash_func`` values.  Matched
    pairs are handed to ``tree_extract()``; a lone ``MULTITAG_HOLE`` element
    on the brain side absorbs the unmatched sample elements.

    Args:
        list1: list of etree Elements from the brain.
        list2: list of etree Elements from the sample.
        hash_func: callable applied to every element to get its comparison
            key.  When it is ``element_hash_strict`` and no run is found,
            the comparison is retried with ``element_hash_loose``.
        algorithm: passed through unchanged to ``tree_extract()`` and to the
            recursive calls.

    Returns:
        A list of extracted items in left-to-right order.  Items produced
        here are dicts with the keys ``'type'``, ``'value'`` and ``'tag'``;
        the rest is whatever ``tree_extract()`` yields.

    Raises:
        NoMatch: if the two child lists cannot be reconciled.
    """
    if list1 == list2 == []:
        return []

    brain_hashes = [hash_func(node) for node in list1]
    sample_hashes = [hash_func(node) for node in list2]
    run_len, start1, start2 = longest_common_substring(
        brain_hashes, sample_hashes
    )

    extracted = []

    if run_len == 0:
        if [node.tag for node in list1] == ['MULTITAG_HOLE']:
            # The brain is a single multitag hole: it swallows every sample
            # child, serialized as HTML.
            # NOTE(review): if etree.tostring() returns bytes here (the
            # default on Python 3), joining with '' will fail -- confirm the
            # targeted Python version.
            html = ''.join(
                etree.tostring(node, method='html') for node in list2
            )
            extracted.append({'type': 'multitag', 'value': html, 'tag': None})
        elif hash_func == element_hash_strict:
            # Nothing matched strictly: retry the same lists, loosely.
            extracted.extend(
                tree_extract_children(
                    list1, list2, element_hash_loose, algorithm
                )
            )
        else:
            raise NoMatch(
                'Brain tag had children %r, but sample had %r' % (list1, list2)
            )

    # Whatever sits to the left of the matched run.
    if start1 > 0 and start2 > 0:
        # Both sides have a prefix; align those prefixes (strict first).
        extracted.extend(
            tree_extract_children(
                list1[:start1], list2[:start2], element_hash_strict, algorithm
            )
        )
    elif start1 > 0:
        # Only the brain has a prefix.  That is acceptable only when the
        # prefix is a single multitag hole, which then matched nothing.
        brain_prefix = list1[:start1]
        if [node.tag for node in brain_prefix] == ['MULTITAG_HOLE']:
            extracted.append({'type': 'multitag', 'value': '', 'tag': None})
        else:
            raise NoMatch(
                'Brain tag had children %r, but sample had %r'
                % (brain_prefix, list2)
            )
    elif start2 > 0:
        # Only the sample has a prefix: the brain has nothing to match it.
        raise NoMatch(
            'Brain tag had children %r, but sample had %r' % (list1, list2)
        )

    if run_len > 0:
        end1 = start1 + run_len
        end2 = start2 + run_len

        # The matched run itself: extract from the elements pair by pair.
        for brain_node, sample_node in zip(
            list1[start1:end1], list2[start2:end2]
        ):
            extracted.extend(tree_extract(brain_node, sample_node, algorithm))

        # Leftovers on the right of EITHER list are handled by recursion,
        # which raises or emits a hole as appropriate.
        if end1 < len(list1) or end2 < len(list2):
            extracted.extend(
                tree_extract_children(
                    list1[end1:], list2[end2:], element_hash_strict, algorithm
                )
            )

    return extracted
def LCS(self, seq1, seq2):
    """Delegate to ``longest_common_substring()`` for the two sequences.

    Args:
        seq1: first sequence, forwarded unchanged.
        seq2: second sequence, forwarded unchanged.

    Returns:
        Whatever ``longest_common_substring(seq1, seq2)`` returns.
    """
    common = longest_common_substring(seq1, seq2)
    return common