def save_response(self, html_code, url, headers, crawl_date):
    # Let the indexer save the page as normal and also feed it into template detection.
    try:
        tree = makeTree(html_code, self.scraper.domain)
        # Only use the first `max_templates` pages to build the domain template.
        if self.templates_done < self.scraper.config['max_templates']:
            self.templates_done += 1
            self.scraper.domain_nodes_dict.add_template_elements(tree)
        self.scraper.url_to_headers_mapping[url] = headers
        self.data[url] = self.scraper.process(url, tree, False, ['cleaned'])
        self.data[url]['crawl_date'] = crawl_date
        scrape_date = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(time.time()))
        self.data[url]['scrape_date'] = scrape_date
    except Exception as e:
        LOGGER.error("CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                     url, str(e), traceback.format_exc())
def load_local_pages(self):
    saved_html_dir = os.path.join(self.collections_path, self.collection_name)
    for dir_path, _, files in os.walk(saved_html_dir):
        for name in files:
            # Skip macOS metadata files such as .DS_Store.
            if name.startswith('.DS_'):
                continue
            with open(os.path.join(dir_path, name)) as f:
                try:
                    js = json.load(f)
                except ValueError:
                    print('failed to load json {}'.format(name))
                    continue
                try:
                    self.url_to_tree_mapping[js['url']] = makeTree(js['html'], self.domain)
                    self.url_to_headers_mapping[js['url']] = js['headers']
                # pylint: disable=broad-except
                except Exception as e:
                    print(str(e))
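
# A minimal usage sketch, kept as comments so it does not affect the module.
# Assumptions (not taken from this file): `Indexer` is a hypothetical name for the
# surrounding class, and its constructor arguments are illustrative; the class is
# expected to expose the attributes used above (collections_path, collection_name,
# domain, url_to_tree_mapping, url_to_headers_mapping).
#
#     indexer = Indexer(collections_path='collections', collection_name='example')
#     indexer.load_local_pages()  # rebuild trees from previously saved JSON pages
#     for url, tree in indexer.url_to_tree_mapping.items():
#         print(url)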