Exemplo n.º 1
0
    def get_dom_list(self, configuration):
        """Collect the DOM of the current page plus its same-domain frames.

        The page source is parsed with BeautifulSoup ('html5lib'). For every
        tag name from ``configuration.get_frame_tags()``, each matching tag
        is handed to ``get_dom_of_iframe`` only when all of these hold:
        ``configuration.is_dom_inside_iframe()`` is truthy, the tag has a
        non-empty ``src``, and ``is_same_domain`` accepts that ``src``.
        After a successful call the tag is cleared in the parsed page; any
        exception is logged and the tag is left untouched.

        Returns a ``(dom_list, url)`` tuple. The page's own entry (keys
        'url', 'dom', 'iframe_path' with 'iframe_path' set to None) is
        appended last; frame entries are expected to be added to the same
        list by ``get_dom_of_iframe``.
        """
        collected = []
        page_source = self.get_source()
        page_url = self.get_url()
        page_soup = BeautifulSoup(page_source, 'html5lib')

        for tag_name in configuration.get_frame_tags():
            for frame_tag in page_soup.find_all(tag_name):
                frame_xpath = DomAnalyzer._get_xpath(frame_tag)
                if frame_tag.has_attr('src'):
                    frame_src = frame_tag['src']
                else:
                    frame_src = None

                # Guard clauses, evaluated in the same order as the checks
                # they replace: config flag, src present, same domain.
                if not configuration.is_dom_inside_iframe():
                    continue
                if not frame_src:
                    continue
                if not self.is_same_domain(configuration, frame_src):
                    continue

                # Broad catch is deliberate: a failing frame is logged and
                # skipped so the remaining frames are still processed.
                try:
                    print('_1:', frame_xpath, '  : ', frame_src)
                    self.get_dom_of_iframe(
                        configuration, collected, [frame_xpath], frame_src)
                    frame_tag.clear()
                except Exception as err:
                    logging.error(
                        ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                        str(err))

        page_entry = {
            'url': page_url,
            'dom': str(page_soup.prettify()),
            'iframe_path': None,
        }
        collected.append(page_entry)

        return collected, page_url
Exemplo n.º 2
0
    def get_dom_of_iframe(self, configuration, dom_list, iframe_xpath_list,
                          src):
        """Record the DOM of one frame and recurse into its same-domain frames.

        Args:
            configuration: supplies ``get_frame_tags()`` (tag names to look
                for) and is forwarded to ``is_same_domain``.
            dom_list: list that receives one dict per visited frame, with
                the keys 'url', 'dom' and 'iframe_path'.
            iframe_xpath_list: xpaths passed to
                ``switch_iframe_and_get_source`` to reach this frame. It is
                not modified by this method.
            src: the ``src`` of this frame; stored as the entry's 'url'.

        Child frames are visited only when they have a non-empty ``src``
        that ``is_same_domain`` accepts. A child that was visited without
        error is cleared in this frame's parsed DOM; errors are logged and
        the child is skipped. This frame's own entry is appended after its
        children have been handled.
        """
        dom = self.switch_iframe_and_get_source(iframe_xpath_list)
        soup = BeautifulSoup(dom, 'html5lib')
        for frame in configuration.get_frame_tags():
            for iframe_tag in soup.find_all(frame):
                iframe_xpath = DomAnalyzer._get_xpath(iframe_tag)
                iframe_src = iframe_tag['src'] if iframe_tag.has_attr(
                    'src') else None
                if not (iframe_src and self.is_same_domain(configuration,
                                                           iframe_src)):
                    continue

                # Build a fresh path for this child instead of appending to
                # the shared list: appending in place made sibling frames
                # accumulate in one path and silently changed the
                # 'iframe_path' of entries that were already recorded.
                child_xpath_list = list(iframe_xpath_list) + [iframe_xpath]
                try:
                    print('_2:', iframe_xpath, '  : ', iframe_src)
                    self.get_dom_of_iframe(configuration, dom_list,
                                           child_xpath_list, iframe_src)
                    iframe_tag.clear()
                except Exception as e:
                    # Best effort: log and keep going with the other frames.
                    logging.error(
                        ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                        str(e))

        dom_list.append({
            'url': src,
            'dom': str(soup.prettify()),
            'iframe_path': iframe_xpath_list,
        })
Exemplo n.º 3
0
    def get_dom_list(self, configuration):
        """Collect the DOM of the current page and, optionally, of its frames.

        The page source comes from ``switch_iframe_and_get_source()`` called
        without arguments and is parsed with BeautifulSoup ('html5lib').
        For every tag name from ``configuration.get_frame_tags()``, each
        matching tag is handled as follows:

        * when ``configuration.is_dom_inside_iframe()`` is truthy, the tag
          is handed to ``get_dom_of_iframe`` (its ``src`` may be None);
        * the tag is then cleared in the parsed page, whether or not the
          frame was visited.

        An exception in either step is logged, and the tag is left as is.

        Returns a ``(dom_list, url)`` tuple. The page's own entry (keys
        'url', 'dom', 'iframe_path' with 'iframe_path' set to None) is
        appended last; frame entries are expected to be added to the same
        list by ``get_dom_of_iframe``.
        """
        dom_list = []
        new_dom = self.switch_iframe_and_get_source()

        url = self.get_url()
        soup = BeautifulSoup(new_dom, 'html5lib')
        for frame in configuration.get_frame_tags():
            for iframe_tag in soup.find_all(frame):
                iframe_xpath = DomAnalyzer._get_xpath(iframe_tag)
                iframe_src = iframe_tag['src'] if iframe_tag.has_attr(
                    'src') else None
                # Broad catch is deliberate (the original author could not
                # pin down the failure seen around frames without a src).
                try:
                    if configuration.is_dom_inside_iframe():
                        self.get_dom_of_iframe(configuration, dom_list,
                                               [iframe_xpath], iframe_src)
                    iframe_tag.clear()
                except Exception as e:
                    logging.error(
                        ' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ',
                        str(e))
        dom_list.append({
            'url': url,
            'dom': str(soup),
            'iframe_path': None,
        })

        return dom_list, url
Exemplo n.º 4
0
 def test_get_xpath(self):
     """_get_xpath gives the indexed path of a <form> inside the second <div>."""
     markup = '''
     <html><body>
       <div></div>
       <div>
         <form>
           <input><button></button>
         </form>
       </div>
     </body></html>
     '''
     parsed = BeautifulSoup(markup, 'html.parser')
     form_tag = parsed.find('form')
     expected_xpath = '//html/body/div[2]/form[1]'
     actual_xpath = DomAnalyzer._get_xpath(form_tag)
     self.assertEqual(actual_xpath, expected_xpath)
Exemplo n.º 5
0
 def get_dom_of_iframe(self, configuration, dom_list, iframe_xpath_list, src):
     """Record the DOM of one frame and recurse into every frame inside it.

     Args:
         configuration: supplies ``get_frame_tags()`` (tag names to look for).
         dom_list: list that receives one dict per visited frame, with the
             keys 'url', 'dom' and 'iframe_path'.
         iframe_xpath_list: xpaths passed to
             ``switch_iframe_and_get_source`` to reach this frame. It is
             not modified by this method.
         src: the ``src`` of this frame (may be None); stored as 'url'.

     Every child frame is visited, with or without a ``src``. A child that
     was visited without error is cleared in this frame's parsed DOM;
     errors are logged and the child is skipped. This frame's own entry is
     appended after its children have been handled.
     """
     dom = self.switch_iframe_and_get_source(iframe_xpath_list)
     soup = BeautifulSoup(dom, 'html5lib')
     for frame in configuration.get_frame_tags():
         for iframe_tag in soup.find_all(frame):
             iframe_xpath = DomAnalyzer._get_xpath(iframe_tag)
             # Build a fresh path for this child instead of appending to the
             # shared list: appending in place made sibling frames
             # accumulate in one path and silently changed the
             # 'iframe_path' of entries that were already recorded.
             child_xpath_list = list(iframe_xpath_list) + [iframe_xpath]
             iframe_src = iframe_tag['src'] if iframe_tag.has_attr('src') else None
             try:
                 self.get_dom_of_iframe(configuration, dom_list, child_xpath_list, iframe_src)
                 iframe_tag.clear()
             except Exception as e:
                 # Best effort: log and keep going with the other frames.
                 logging.error(' get_dom_of_iframe: %s \t\t__from crawler.py get_dom_list() ', str(e))
     dom_list.append( {
             'url' : src,
             'dom' : str(soup),
             'iframe_path' : iframe_xpath_list,
         } )