def site_spider(self):
    # Seed the spider with each site plus any URLs the search engines found for it.
    entry_urls_list = []
    for site in self.site_list:
        entry_urls = [site]
        entry_urls.extend(self.search_engines_result.get(site, []))
        entry_urls_list.append(entry_urls)

    site_spider_result = services.site_spider_thread(entry_urls_list)

    for site in site_spider_result:
        target_urls = site_spider_result[site]

        # Keep only URLs not already fetched in this task, and remember them.
        new_target_urls = []
        for url in target_urls:
            if url in self.page_url_list:
                continue
            new_target_urls.append(url)
            self.page_url_list.append(url)

        page_map = services.page_fetch(new_target_urls)
        for url in page_map:
            item = {
                "site": site,
                "task_id": self.task_id,
                "source": CollectSource.SITESPIDER
            }
            item.update(page_map[url])

            domain_parsed = utils.domain_parsed(site)
            if domain_parsed:
                item["fld"] = domain_parsed["fld"]

            utils.conn_db('url').insert_one(item)
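
# Illustrative sketch, not part of the original code: the "fld" stored above is
# the registrable (first-level) domain of the site. utils.domain_parsed is not
# reproduced here; the third-party `tld` package computes a comparable value.
from tld import get_fld

example_fld = get_fld("http://www.example.co.uk/login", fail_silently=True)
# example_fld == "example.co.uk"
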
def site_spider(self):
    # Simpler variant: each site is its own single-URL entry list, with no
    # search-engine seeds and no deduplication against previously fetched pages.
    entry_urls_list = []
    for site in self.site_list:
        entry_urls_list.append([site])

    site_spider_result = services.site_spider_thread(entry_urls_list)

    for site in site_spider_result:
        target_urls = []
        target_urls.extend(site_spider_result[site])
        if not target_urls:
            continue

        page_map = services.page_fetch(target_urls)
        for url in page_map:
            item = {
                "site": site,
                "task_id": self.task_id,
                "source": CollectSource.SITESPIDER
            }
            item.update(page_map[url])
            utils.conn_db('url').insert_one(item)
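
# Illustrative sketch, not part of the original code: the URL deduplication
# pattern used in the first variant above, shown standalone. The original keeps
# seen URLs in a list (self.page_url_list); a set gives O(1) membership checks.
def dedup_new_urls(candidate_urls, seen_urls):
    """Return only the URLs not seen before, recording them in seen_urls."""
    new_urls = []
    for url in candidate_urls:
        if url in seen_urls:
            continue
        seen_urls.add(url)
        new_urls.append(url)
    return new_urls


seen = set()
assert dedup_new_urls(["http://a.example/", "http://b.example/"], seen) == \
    ["http://a.example/", "http://b.example/"]
assert dedup_new_urls(["http://a.example/", "http://c.example/"], seen) == \
    ["http://c.example/"]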