def get_dom_list(self, configuration):
    """Collect the DOM of the current page and of its crawlable frames.

    The page source is parsed with html5lib. For every tag name returned by
    ``configuration.get_frame_tags()``, each matching tag that carries a
    ``src`` on the same domain is handed to ``get_dom_of_iframe`` (only when
    ``configuration.is_dom_inside_iframe()`` is truthy), which appends the
    frame DOM(s) to the result list. The page's own entry is appended last.

    Args:
        configuration: crawler configuration; must provide
            ``get_frame_tags()`` and ``is_dom_inside_iframe()``.

    Returns:
        A ``(dom_list, url)`` tuple. ``dom_list`` is a list of dicts with the
        keys ``'url'``, ``'dom'`` and ``'iframe_path'``; the entry for the
        page itself has ``'iframe_path'`` set to ``None``.
    """
    collected = []
    page_source = self.get_source()
    url = self.get_url()
    soup = BeautifulSoup(page_source, 'html5lib')

    for tag_name in configuration.get_frame_tags():
        for frame_tag in soup.find_all(tag_name):
            frame_xpath = DomAnalyzer._get_xpath(frame_tag)
            frame_src = frame_tag.get('src')

            # Guard clauses, evaluated in the same order as the original
            # short-circuit condition.
            if not configuration.is_dom_inside_iframe():
                continue
            if not frame_src:
                continue
            if not self.is_same_domain(configuration, frame_src):
                continue

            # Failures while harvesting or clearing a frame are logged and
            # skipped so the remaining frames and the page are still captured.
            try:
                print('_1:', frame_xpath, ' : ', frame_src)
                self.get_dom_of_iframe(
                    configuration, collected, [frame_xpath], frame_src)
                # Empty the tag so the page DOM below does not repeat
                # whatever markup was nested inside the frame element.
                frame_tag.clear()
            except Exception as err:
                logging.error(
                    ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                    str(err))

    page_entry = {
        'url': url,
        'dom': str(soup.prettify()),
        'iframe_path': None,
    }
    collected.append(page_entry)
    return collected, url
def get_dom_of_iframe(self, configuration, dom_list, iframe_xpath_list, src):
    """Record the DOM of one frame and, recursively, of its same-domain frames.

    The frame source is obtained through
    ``self.switch_iframe_and_get_source(iframe_xpath_list)`` and parsed with
    html5lib. Each nested frame tag that has a ``src`` accepted by
    ``self.is_same_domain`` is processed recursively with its own path
    (this frame's path plus the nested tag's XPath). Nested frames are
    appended to ``dom_list`` before this frame's own entry.

    Args:
        configuration: crawler configuration; must provide ``get_frame_tags()``.
        dom_list: list that receives one dict per frame with the keys
            ``'url'``, ``'dom'`` and ``'iframe_path'``. Mutated in place.
        iframe_xpath_list: list of XPaths leading from the top document to
            this frame. It is not modified.
        src: the frame's ``src`` value, stored as the entry's ``'url'``.

    Returns:
        None. Results are delivered through ``dom_list``.
    """
    dom = self.switch_iframe_and_get_source(iframe_xpath_list)
    soup = BeautifulSoup(dom, 'html5lib')
    for frame in configuration.get_frame_tags():
        for iframe_tag in soup.find_all(frame):
            iframe_xpath = DomAnalyzer._get_xpath(iframe_tag)
            iframe_src = iframe_tag.get('src')
            if iframe_src and self.is_same_domain(configuration, iframe_src):
                # Build a fresh path for this child instead of appending to
                # the shared list: appending made sibling frames pile up in
                # one path and corrupted the path stored for this frame.
                child_path = list(iframe_xpath_list) + [iframe_xpath]
                try:
                    print('_2:', iframe_xpath, ' : ', iframe_src)
                    self.get_dom_of_iframe(
                        configuration, dom_list, child_path, iframe_src)
                    # Empty the tag so this frame's DOM below does not
                    # repeat markup nested inside the child frame element.
                    iframe_tag.clear()
                except Exception as e:
                    # Best effort: a failing child frame must not prevent
                    # this frame's DOM from being recorded.
                    logging.error(
                        ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                        str(e))
    dom_list.append({
        'url': src,
        'dom': str(soup.prettify()),
        'iframe_path': iframe_xpath_list,
    })
def get_dom_list(self, configuration):
    """Collect the DOM of the current page and of the frames it contains.

    The source returned by ``self.switch_iframe_and_get_source()`` is parsed
    with html5lib. For every tag name from ``configuration.get_frame_tags()``
    each matching tag is passed to ``get_dom_of_iframe`` when
    ``configuration.is_dom_inside_iframe()`` is truthy, and the tag's
    contents are then cleared. The page's own entry is appended last.

    Args:
        configuration: crawler configuration; must provide
            ``get_frame_tags()`` and ``is_dom_inside_iframe()``.

    Returns:
        A ``(dom_list, url)`` tuple. ``dom_list`` is a list of dicts with the
        keys ``'url'``, ``'dom'`` and ``'iframe_path'``; the entry for the
        page itself has ``'iframe_path'`` set to ``None``.
    """
    dom_list = []
    new_dom = self.switch_iframe_and_get_source()
    url = self.get_url()
    soup = BeautifulSoup(new_dom, 'html5lib')
    for frame in configuration.get_frame_tags():
        for iframe_tag in soup.find_all(frame):
            iframe_xpath = DomAnalyzer._get_xpath(iframe_tag)
            # May be None: frames without a src are still processed here.
            iframe_src = iframe_tag.get('src')
            # Failures while harvesting or clearing a frame are logged and
            # skipped so the remaining frames and the page are still captured.
            try:
                if configuration.is_dom_inside_iframe():
                    self.get_dom_of_iframe(
                        configuration, dom_list, [iframe_xpath], iframe_src)
                # NOTE(review): the tag is cleared whether or not frame
                # crawling is enabled - confirm this nesting is intended.
                iframe_tag.clear()
            except Exception as e:
                logging.error(
                    ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                    str(e))
    dom_list.append({
        'url': url,
        'dom': str(soup),
        'iframe_path': None,
    })
    return dom_list, url
def test_get_xpath(self):
    """The form inside the second <div> must map to an indexed XPath."""
    markup = ''' <html><body> <div></div> <div> <form> <input><button></button> </form> </div> </body></html> '''
    parsed = BeautifulSoup(markup, 'html.parser')
    form_tag = parsed.find('form')
    actual_xpath = DomAnalyzer._get_xpath(form_tag)
    self.assertEqual(actual_xpath, '//html/body/div[2]/form[1]')
def get_dom_of_iframe(self, configuration, dom_list, iframe_xpath_list, src):
    """Record the DOM of one frame and, recursively, of every nested frame.

    The frame source is obtained through
    ``self.switch_iframe_and_get_source(iframe_xpath_list)`` and parsed with
    html5lib. Every nested frame tag is processed recursively with its own
    path (this frame's path plus the nested tag's XPath). Nested frames are
    appended to ``dom_list`` before this frame's own entry.

    Args:
        configuration: crawler configuration; must provide ``get_frame_tags()``.
        dom_list: list that receives one dict per frame with the keys
            ``'url'``, ``'dom'`` and ``'iframe_path'``. Mutated in place.
        iframe_xpath_list: list of XPaths leading from the top document to
            this frame. It is not modified.
        src: the frame's ``src`` value (may be ``None``), stored as the
            entry's ``'url'``.

    Returns:
        None. Results are delivered through ``dom_list``.
    """
    dom = self.switch_iframe_and_get_source(iframe_xpath_list)
    soup = BeautifulSoup(dom, 'html5lib')
    for frame in configuration.get_frame_tags():
        for iframe_tag in soup.find_all(frame):
            iframe_xpath = DomAnalyzer._get_xpath(iframe_tag)
            # Build a fresh path for this child instead of appending to the
            # shared list: appending made sibling frames pile up in one path
            # and corrupted the path stored for this frame.
            child_path = list(iframe_xpath_list) + [iframe_xpath]
            iframe_src = iframe_tag.get('src')
            try:
                self.get_dom_of_iframe(
                    configuration, dom_list, child_path, iframe_src)
                # Empty the tag so this frame's DOM below does not repeat
                # markup nested inside the child frame element.
                iframe_tag.clear()
            except Exception as e:
                # Best effort: a failing child frame must not prevent this
                # frame's DOM from being recorded.
                logging.error(
                    ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                    str(e))
    dom_list.append({
        'url': src,
        'dom': str(soup),
        'iframe_path': iframe_xpath_list,
    })