from typing import Optional


class Crawler:
    """Crawler that ties a Downloader and a Parser together."""

    def __init__(self) -> None:
        self.downloader = Downloader()
        self.parser = Parser()

    def scrape(self, keyword: Optional[str] = None) -> bool:
        """Download the result page for ``keyword`` and print the parsed result.

        Args:
            keyword: Search keyword to scrape; nothing happens if it is empty.

        Returns:
            bool: True if a page was downloaded and parsed, False otherwise.
        """
        if keyword:
            html = self.downloader.get_text_from_keyword(keyword)
            if html:
                self.parser.print_result_from_parser(html)
                return True
        return False

    def change_url(self):
        """change_url"""
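A minimal usage sketch for the Crawler above, assuming Downloader and Parser come from the surrounding project; the keyword is only an example value:

if __name__ == '__main__':
    crawler = Crawler()
    # scrape() returns True only when the page was downloaded and parsed.
    ok = crawler.scrape('手機')
    print('scrape succeeded:', ok)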
import unittest


class DownloaderTestCase(unittest.TestCase):
    def setUp(self):
        self.downloader = Downloader()
        self.keyword = '手機'  # search keyword ("mobile phone")
        self.response = self.downloader.request(self.keyword)

    def test_request(self):
        self.assertEqual(200, self.response.status_code)

    def test_get_response_text(self):
        html = self.downloader.get_response_text(self.response)
        self.assertTrue(html)

    def test_get_text_from_keyword(self):
        html = self.downloader.get_text_from_keyword(self.keyword)
        self.assertTrue(html)
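The suite can be run with the standard library test runner; a minimal sketch, assuming the test case lives in a module that is executed directly:

if __name__ == '__main__':
    unittest.main()  # discovers and runs DownloaderTestCase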
import time


class Spider:
    def __init__(self):
        self.downloader = Downloader()
        self.processor = Processor()
        self.pipeline = Pipeline()
        self.schedule = Schedule()

    def set_downloader(self, downloader):
        self.downloader = downloader

    def set_processor(self, processor):
        self.processor = processor

    def set_pipeline(self, pipeline):
        self.pipeline = pipeline

    def set_schedule(self, schedule):
        self.schedule = schedule

    def run(self):
        while True:
            request = self.schedule.pop()
            if request is None:
                print('Waiting 5 seconds')
                time.sleep(5)
                continue
            print('pop url :' + request.url)
            page = self.downloader.download(request)
            if not page.download_success():
                # Re-queue the request and try again later.
                self.schedule.push(request)
                continue
            self.processor.process(page)
            self.add_target_url(page)
            if page.is_skip:
                continue
            # Persist the extracted item.
            self.pipeline.data_persistent(page.get_item())
            print('Waiting 5 seconds')
            time.sleep(5)

    def add_target_url(self, page):
        for request in page.get_target_requests():
            self.schedule.push(request)
            print('add target url : ' + request.url)

    def add_start_url(self, request):
        if isinstance(request, Request):
            self.schedule.push(request)
        elif isinstance(request, str):
            self.schedule.push(Request(request))
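A sketch of how the Spider is typically wired and started; the seed URL is a placeholder, and add_start_url accepts either a Request or a plain URL string, as defined above:

spider = Spider()
spider.add_start_url('https://example.com/start')  # placeholder seed URL
spider.run()  # loops forever, polling the schedule every 5 seconds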
def __init__(self, param: str, downloader=Downloader(), **kwargs):
    # Note: the default Downloader() is built once, when the function is defined,
    # and is shared by every call that does not pass its own downloader.
    super().__init__(
        downloader,
        urljoin(config.get("default", "default_url"), param),
        name=kwargs["name"],
    )
def crawl(self, download_searches=True, download_offers=True, remove_files=False,
          start_page=1, end_page=30, rent=True):
    if remove_files:
        log.info("Removing files")
        filer = Filer(self.download_path_offers)
        filer.empty_dir()
        filer = Filer(self.download_path_searches)
        filer.empty_dir()

    if download_searches:
        log.info("Downloading files")
        for service in self.services:
            d = Downloader(self.download_path_searches, self.download_path_offers,
                           service=service, city=self.city,
                           property_type=self.property_type, rent=rent)
            d.download_main_pages(start_page, end_page)

    # Get all links to offers
    filer_searches = Filer(self.download_path_searches)
    all_offers = {}
    for service in self.services:
        if service not in all_offers:
            all_offers[service] = []
        for file in filer_searches.get_all_files():
            with open("{}/{}".format(self.download_path_searches, file), "r",
                      encoding="utf-8") as f:
                scraper = Scraper(f.read(), service)
                for link in scraper.get_search_results():
                    all_offers[service].append(link)
    log.debug("Links to offers:\t{}".format(len(all_offers)))

    # Download offers
    if download_offers:
        for service in self.services:
            d = Downloader(self.download_path_searches, self.download_path_offers,
                           service=service, city=self.city)
            progress = 0
            for url in all_offers[service]:
                if d.download_offer_page(url):
                    progress += 1
                    log.info("Downloaded [{}]:\t{}/{}".format(
                        service, progress, len(all_offers[service])))

    filer = Filer(self.download_path_offers)
    all_files = filer.get_all_files()

    # Add offers to DB
    counter = 0
    for file in all_files:
        if counter % 50 == 0:
            # commit records in DB every 50 offers
            session.commit()
        params = {}
        with open("{}/{}".format(self.download_path_offers, file), "r",
                  encoding="utf-8") as f:
            if "gratka" in file:
                log.debug("Parsing gratka file:\t{}".format(file))
                parser = ParserGratka(f.read())
                params = parser.parse_site()
            elif "otodom" in file:
                log.debug("Parsing otodom file:\t{}".format(file))
                parser = ParserOtodom(f.read())
                params = parser.parse_site()
        if "offer_id" in params:
            instance = session.query(Offer).filter(
                Offer.offer_id == params['offer_id']).first()
            if not instance:
                log.info("Adding offer:\t{}\t{}/{}".format(
                    params['offer_id'], counter, len(all_files)))
                session.add(Offer(**params))
        counter += 1
    session.commit()
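For reference, a hedged call to the crawl method above on an already-constructed instance of its enclosing class (construction is project-specific and not shown); every argument value here is illustrative:

# 'crawler' stands for an instance of the class that defines crawl().
crawler.crawl(
    download_searches=True,   # fetch search-result pages for each service
    download_offers=True,     # then fetch the individual offer pages
    remove_files=False,       # keep previously downloaded files
    start_page=1,
    end_page=5,               # smaller range than the default of 30
    rent=True,
)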
def crawl(self):
    for start_link in self.starting_urls:
        downloader = Downloader(start_link)
        app = downloader.get_app_from_link()
        self.links_visited.add(start_link)
        self.apps.append(app)
        self.depth_links.append(app.in_links)
        self.depth_links.append(app.out_links)
        with open('./resources/jsonFiles/item_pipeline_0_' + app.uid + '.json',
                  'w') as outfile:
            json.dump(app.__dict__, outfile)
    self.num_docs_crawled = 1

    while self.num_docs_crawled < self.num_docs_to_be_crawled:
        # Follow in-links at the current depth, at most self.in_degree per level.
        current_in_links = []
        current_out_links = []
        count = 0
        for link in self.depth_links[self.current_depth]:
            if link not in self.links_visited and count < self.in_degree:
                current_app = Downloader(link).get_app_from_link()
                if current_app == 0:
                    continue
                current_in_links.extend(current_app.in_links)
                current_out_links.extend(current_app.out_links)
                with open('./resources/jsonFiles/item_pipeline_'
                          + str(self.num_docs_crawled) + '_' + current_app.uid + '.json',
                          'w') as outfile:
                    json.dump(current_app.__dict__, outfile)
                update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
                self.num_docs_crawled += 1
                self.apps.append(current_app)
                self.links_visited.add(link)
                count += 1
        self.depth_links.append(current_in_links)
        self.depth_links.append(current_out_links)
        self.current_depth += 1

        # Follow out-links at the next depth, at most self.out_degree per level.
        current_in_links = []
        current_out_links = []
        count = 0
        for link in self.depth_links[self.current_depth]:
            if link not in self.links_visited and count < self.out_degree:
                current_app = Downloader(link).get_app_from_link()
                if current_app == 0:
                    continue
                current_in_links.extend(current_app.in_links)
                current_out_links.extend(current_app.out_links)
                with open('./resources/jsonFiles/item_pipeline_'
                          + str(self.num_docs_crawled) + '_' + current_app.uid + '.json',
                          'w') as outfile:
                    json.dump(current_app.__dict__, outfile)
                update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
                self.num_docs_crawled += 1
                self.apps.append(current_app)
                self.links_visited.add(link)
                count += 1
        self.current_depth += 1
        self.depth_links.append(current_in_links)
        self.depth_links.append(current_out_links)