def examples_to_nodes(field_list, page_url,
                      text_filter_func=text_filter_strip_newline,
                      user_agent=DEFAULT_USER_AGENT):
    """
    For each field, collect every node whose text matches the field's example.

    The page is loaded once with ``url_to_DOM(page_url, user_agent)``.  The
    text of every descendant of the returned root (``node.text_content()``)
    is passed through ``text_filter_func`` and compared for equality with
    the equally filtered ``field.example``.

    field_list: Iterable of fields; each must expose ``name`` and ``example``.
    page_url: The URL of the page to search.
    text_filter_func: Function applied to both the node text and the example
                      before they are compared.
    user_agent: Passed straight through to ``url_to_DOM``.

    Returns a dict mapping ``field.name`` to a list of
    ``(node, xpath, record_count)`` tuples, in the order the nodes are
    yielded by ``root.iterdescendants()``.  ``xpath`` is the result of
    ``node_to_absolute_XPATH(node)`` and ``record_count`` is the number of
    elements ``root.findall(xpath)`` returns.  A field without any matching
    node maps to an empty list.  If two fields share a name, the later one
    replaces the earlier one's entry.
    """
    root = url_to_DOM(page_url, user_agent)

    # Filter every node's text once up front instead of re-walking the tree
    # and re-filtering the same text for each field.
    # NOTE(review): assumes text_filter_func is a pure function - confirm.
    filtered_nodes = [
        (node, text_filter_func(node.text_content()))
        for node in root.iterdescendants()
    ]

    d = {}
    for field in field_list:
        # The example does not depend on the node, so filter it only once.
        wanted_text = text_filter_func(field.example)
        matches = []
        for node, node_text in filtered_nodes:
            if node_text != wanted_text:
                continue
            xpath = node_to_absolute_XPATH(node)
            record_count = len(root.findall(xpath))
            matches.append((node, xpath, record_count))
        d[field.name] = matches
    return d
def example_to_node(field, page_url, filter=False,
                    disambiguation_method=take_last,
                    text_filter_func=text_filter_strip_newline,
                    user_agent=None):
    """
    Find the target node on a page that contains a field's example text.

    field: An ExampleField.  Its ``example`` is the text searched for and
           its ``name`` is reported in the error message.
    page_url: The URL of the selected page.
    filter: If true, ``text_filter_func`` is applied to both the node text
            and the example before comparison; otherwise the raw
            ``node.text_content()`` is compared with ``field.example``.
            (The name shadows the builtin but is kept for existing callers.)
    disambiguation_method: A function that takes the list of matching nodes
                           and decides which node to return.
    text_filter_func: Allows the user to supply a function for filtering
                      text.  default: util.text_filter_strip_newline
                      (removes surrounding whitespace and replaces newlines
                      with '').
    user_agent: Optional user agent forwarded to ``url_to_DOM``.  When left
                as None, ``url_to_DOM`` is called with the URL only, so its
                own default applies.

    Returns whatever ``disambiguation_method`` returns for the list of
    matching nodes.

    Raises ValueError if no node's text equals the example.
    """
    if user_agent is None:
        root = url_to_DOM(page_url)
    else:
        root = url_to_DOM(page_url, user_agent)

    if filter:
        # The filtered example is the same for every node; compute it once.
        wanted_text = text_filter_func(field.example)
        target_nodes = [
            node for node in root.iterdescendants()
            if text_filter_func(node.text_content()) == wanted_text
        ]
    else:
        wanted_text = field.example
        target_nodes = [
            node for node in root.iterdescendants()
            if node.text_content() == wanted_text
        ]

    if not target_nodes:
        # One-element tuple so a tuple-valued name cannot break formatting.
        raise ValueError(
            'Node containing the given example not found. Field(%s)'
            % (field.name,))
    return disambiguation_method(target_nodes)