def read_tag(self, element, prev_url):
    """
    Walk a DOM subtree and gather link targets and visible text.

    Args:
        element (DOM node): node to inspect, e.g. one obtained from the
            DOM tree with getElementsByTagName
        prev_url (str): URL of the page being parsed; passed to
            normalize_url together with every href that is found

    Returns:
        tuple: a ``(urls, text)`` pair whose shape depends on the node:

        * text node -> ``(None, stripped_text)``
        * ``<a>`` with an href -> ``([url], None)``, or ``([], None)``
          when normalize_url returns a falsy value
        * ``<a>`` without an href -> ``(None, None)``
        * other accepted element with children -> ``(urls, text)`` where
          every collected text fragment is prefixed by a single space
        * anything else (other node types, tags listed in
          UNACCEPTABLE_HTML_TAGS, childless elements) -> ``(None, None)``
    """
    if element.nodeType == element.TEXT_NODE:
        return (None, element.data.strip())

    # Only element nodes whose tag is not excluded are inspected further.
    if element.nodeType != element.ELEMENT_NODE:
        return (None, None)
    if element.tagName in UNACCEPTABLE_HTML_TAGS:
        return (None, None)

    if element.tagName == "a":
        # Anchors end the walk: their child nodes are never visited, so
        # the link text itself is not returned.
        # NOTE(review): confirm that dropping anchor text is intended.
        if not element.hasAttribute("href"):
            return (None, None)
        target = normalize_url(element.getAttribute("href"), prev_url)
        return ([target], None) if target else ([], None)

    if not element.hasChildNodes():
        return (None, None)

    collected_urls = []
    fragments = []
    for node in element.childNodes:
        found_links, found_text = self.read_tag(node, prev_url)
        if found_text:
            # Each fragment keeps a leading space, so the joined result
            # starts with a space whenever any text was found.
            fragments.append(" " + found_text)
        if found_links:
            collected_urls.extend(found_links)
    return (collected_urls, "".join(fragments))
def add_url(self, url, referer=None):
    """
    Record a URL as seen and enqueue it, provided it normalizes.

    Args:
        url (str): URL to add
        referer (str): URL of the page the link came from; passed to
            normalize_url as its second argument (default None)

    Nothing happens when normalize_url returns a falsy value.
    """
    if not normalize_url(url, referer):
        return

    # NOTE(review): the normalized form is only used as a validity check;
    # the raw url is what gets hashed and queued -- confirm this is
    # intended.
    self.passed.add(hash(url))
    entry = (url, referer)
    self.queue.put(entry)