Example #1
File: crawling.py  Project: bgarrels/sky
 def save_response(self, html_code, url, headers, crawl_date):
     # Let the indexer save the files as normal and also create a Template.
     try:
         tree = makeTree(html_code, self.scraper.domain)
         # Only the first `max_templates` pages contribute template elements.
         if self.templates_done < self.scraper.config['max_templates']:
             self.templates_done += 1
             self.scraper.domain_nodes_dict.add_template_elements(tree)
             self.scraper.url_to_headers_mapping[url] = headers
         self.data[url] = self.scraper.process(url, tree, False, ['cleaned'])
         self.data[url]['crawl_date'] = crawl_date
         scrape_date = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime())
         self.data[url]['scrape_date'] = scrape_date
     except Exception as e:
         LOGGER.error("CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                      url, str(e), traceback.format_exc())
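For context, save_response stores whatever self.scraper.process returns under self.data[url] and then stamps it with the crawl and scrape dates. The following is only a rough sketch of the resulting record, assuming process returns a plain dict; the 'cleaned' key and its value are illustrative, not taken from the sky project:

import time

# Hypothetical shape of one entry in self.data after save_response has run.
# Every key except 'crawl_date' and 'scrape_date' depends entirely on what
# self.scraper.process(url, tree, False, ['cleaned']) returns.
record = {
    'cleaned': '...extracted page text...',   # assumed output of process()
    'crawl_date': '2015-06-01T12:00:00',      # passed in by the crawler
    'scrape_date': time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime()),
}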
Example #2
 def load_local_pages(self):
     # Re-read previously saved pages (JSON files with 'url', 'html' and
     # 'headers' keys) and rebuild the url -> tree / headers mappings.
     saved_html_dir = os.path.join(self.collections_path, self.collection_name)
     for root, _, files in os.walk(saved_html_dir):
         for name in files:
             if name.startswith('.DS_'):
                 continue
             with open(os.path.join(root, name)) as f:
                 try:
                     js = json.load(f)
                 except ValueError:
                     print('failed to load json {}'.format(name))
                     continue
                 try:
                     self.url_to_tree_mapping[js['url']] = makeTree(
                         js['html'], self.domain)
                     self.url_to_headers_mapping[js['url']] = js['headers']
                 # pylint: disable=broad-except
                 except Exception as e:
                     print(str(e))
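load_local_pages only requires that each saved file is a JSON document containing 'url', 'html' and 'headers' keys. As a minimal sketch of producing such a file (the directory layout, file name and values here are made up for illustration, not taken from the sky project):

import json
import os

# Hypothetical page file that load_local_pages above could read back.
# Only the 'url', 'html' and 'headers' keys are required by the loader.
page = {
    'url': 'http://example.com/article',
    'html': '<html><body><p>Hello</p></body></html>',
    'headers': {'Content-Type': 'text/html'},
}
saved_html_dir = os.path.join('collections', 'example_collection')  # assumed layout
os.makedirs(saved_html_dir, exist_ok=True)
with open(os.path.join(saved_html_dir, 'article.json'), 'w') as f:
    json.dump(page, f)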