def url_manage_proc(self, url_q, conn_q, root_url, page_num):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    print('url_manager is working...')
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            # Send the new URL to the worker nodes
            url_q.put(new_url)
            # Once page_num links have been crawled, stop the crawl and save progress
            if url_manager.old_urls_size() > page_num:
                # Notify the crawler nodes to finish
                url_q.put('end')
                print('Control node issued the stop notification!')
                # Shut down the node and persist its state at the same time
                url_manager.save_process('new_urls.txt', url_manager.new_urls)
                url_manager.save_process('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        print('url control working..., solve result')
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except Exception as e:
            time.sleep(1)  # back off briefly
        print('crawled page count:', url_manager.old_urls_size())
        time.sleep(5)
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    # url_manager.add_new_url(root_url)
    while True:
        while url_manager.has_new_url():
            # Fetch a new URL from the URL manager
            new_url = url_manager.get_new_url()
            print(new_url)
            # Send the new URL to the worker nodes
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            # Stop after 2000 links have been crawled, and save progress
            if url_manager.old_url_size() > 2000:
                # Notify the crawler nodes to finish
                url_q.put('end')
                print('Control node issued the stop notification!')
                # Shut down the control node and persist the URL sets
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        # Add the URLs received from result_solve_proc to the URL manager
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)  # back off briefly
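# A minimal sketch (an assumption, not part of the original snippets) of how
# url_manager_proc might be wired to url_q and conn_q on the control node with
# multiprocessing. The NodeManager name, the result_solve_proc stub, and the
# example root URL are hypothetical placeholders.
from multiprocessing import Process, Queue


class NodeManager(object):
    def url_manager_proc(self, url_q, conn_q, root_url):
        pass  # the implementation above would go here

    def result_solve_proc(self, conn_q):
        pass  # would push URLs parsed by worker nodes back into conn_q


if __name__ == '__main__':
    url_q, conn_q = Queue(), Queue()
    node = NodeManager()
    p1 = Process(target=node.url_manager_proc, args=(url_q, conn_q, 'http://example.com/'))
    p2 = Process(target=node.result_solve_proc, args=(conn_q,))
    p1.start(); p2.start()
    p1.join(); p2.join()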
class SpiderMan(object):
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed')
        self.output.output_html()
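# A possible entry point for the single-process crawler above; the root URL is
# only an illustrative assumption, not taken from the original code.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('https://baike.baidu.com/item/Python')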
class Spiderman(object):
    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = HtmlDownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        while self.manage.has_new_url() and self.manage.old_url_size() < 100:
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('crawled %s links so far' % self.manage.old_url_size())
            except Exception:
                print('crawl failed')
        self.output.output_html()
def url_manager_proc(self, url_q, conn_q, root_url):
    url_manager = UrlManager()
    url_manager.add_new_url(root_url)
    while True:
        if url_manager.has_new_url():
            new_url = url_manager.get_new_url()
            url_q.put(new_url)
            print('old_url=', url_manager.old_url_size())
            if url_manager.old_url_size() > 2000:
                url_q.put('end')
                print('Manager notify ending!')
                url_manager.save_progress('new_urls.txt', url_manager.new_urls)
                url_manager.save_progress('old_urls.txt', url_manager.old_urls)
                return
        try:
            if not conn_q.empty():
                urls = conn_q.get()
                url_manager.add_new_urls(urls)
        except BaseException as e:
            time.sleep(0.1)
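# The manager processes above assume a UrlManager with roughly this interface
# (new_urls/old_urls sets, has_new_url, get_new_url, add_new_url(s),
# old_url_size, save_progress). This set-based, pickle-backed version is a
# minimal sketch of that assumption, not the original implementation.
import pickle


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)

    def save_progress(self, path, data):
        # Persist a URL set so a later run can resume from it
        with open(path, 'wb') as f:
            pickle.dump(data, f)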
# -*- coding: utf-8 -*-
# Python 2 snippet. Imports it relies on are listed below; UrlManager and
# geocodeG are project-specific helpers assumed to be importable elsewhere in
# the project, and the multiprocessing origin of Lock/Queue is an assumption.
import codecs
import json
import re
import sys

import requests
import threadpool
import unicodecsv
from bs4 import BeautifulSoup
from multiprocessing import Lock, Process, Queue


class Anjuke(object):
    def __init__(self):
        self.count = 0
        self.wcount = 0
        self.mylock = Lock()
        self.csvfile = open('sz.csv', 'a')  # ks.csv
        self.csvfile.write(codecs.BOM_UTF8)
        self.item_queue = Queue()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
        }
        self.proxies = {
            'http': 'https://121.61.0.33:9999',
            'https': 'https://121.61.0.33:9999'
        }
        self.mysign = True
        # https://suzhou.anjuke.com/community
        self.rawurl = 'https://suzhou.anjuke.com/community/'
        self.urlmanager = UrlManager()

    def get_villages(self):
        rep = requests.get(self.rawurl, headers=self.headers, verify=False, timeout=2)
        soup = BeautifulSoup(rep.text, 'lxml')
        results = soup.find_all('span', attrs={'class': 'elems-l'})
        items = results[0].find_all('a')
        for item in items:
            if item.get("title") == "全部小区":
                print 'skipping the "All communities" option'
                continue
            vurl = item.get('href')
            print vurl
            self.get_villages2(vurl)
            # self.get_villages3(vurl)
        print '======================================'
        self.urlmanager.save_urls_process_status(self.urlmanager.new_urls, r'new_urls.txt')
        self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls, r'crawled_urls.txt')

    def get_villages2(self, vurl):
        rep = requests.get(vurl, headers=self.headers, verify=False, timeout=2)
        soup = BeautifulSoup(rep.text, 'lxml')
        results = soup.find_all('div', attrs={'class': 'sub-items'})
        items = results[0].find_all('a')
        for item in items:
            if item.get("title") == "全部小区":
                print 'skipping the "All communities" option (2)'
                continue
            url2 = item.get('href')
            print url2
            self.get_villages3(url2)

    def get_villages3(self, url):
        while 1:
            rep = requests.get(url, headers=self.headers, verify=False, timeout=2)
            soup = BeautifulSoup(rep.text, 'lxml')
            results = soup.find_all('div', attrs={'_soj': 'xqlb'})
            for result in results:
                item_url = result.get('link')
                self.count += 1
                print 'No.', self.count, ':', item_url
                self.urlmanager.add_new_url(item_url)
            next_item = soup.find('a', attrs={'class': 'aNxt'})
            if next_item is None:
                break
            else:
                url = next_item.get('href')
            # time.sleep(1)

    def get_detail(self, c_url):
        rlist = []
        # item = {}
        try:
            rep = requests.get(c_url, headers=self.headers, verify=False, timeout=4)
        except:
            print 'current:urls num2:', self.urlmanager.new_urls_size()
            self.urlmanager.readd_new_url(c_url)
            return
        if rep.url.startswith('https://www.anjuke.com/captcha-verify/'):
            self.urlmanager.readd_new_url(c_url)
            self.urlmanager.save_urls_process_status(self.urlmanager.new_urls, r'new_urls.txt')
            self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls, r'crawled_urls.txt')
            return
        print rep.url
        soup = BeautifulSoup(rep.text, 'lxml')
        name = soup.find('h1')
        addr = name.find('span')
        print name.contents[0].strip(), addr.string
        rlist.append(name.contents[0].strip().decode("utf-8"))
        rlist.append(addr.string.decode("utf-8"))
        village = re.search("(.*?)-.*?", addr.string).group(1)
        try:
            # geocodeG: external geocoding helper assumed available in the project
            y, x = geocodeG(addr.string.replace(village, '苏州'))  # Kunshan
        except:
            try:
                y, x = geocodeG(name.contents[0].strip().decode("utf-8"))
            except:
                y, x = geocodeG(u'苏州' + village)  # Kunshan
        rlist.append(y)
        rlist.append(x)
        price = re.search('.*comm_midprice":"(.*?)"', rep.text)
        if price is None:
            price = u'暂无报价'  # "no price quoted yet"
        else:
            price = price.group(1)
        print 'price:', price
        rlist.append(price)
        # item['price'] = price.group(1)
        result = soup.find('dl', attrs={"class": 'basic-parms-mod'})
        for a in result.find_all('dd'):
            # print a.string.strip().decode('utf-8')
            rlist.append(a.string.strip().decode('utf-8'))
        print '----'
        comm_id = re.search(r'view/(\d+)', rep.url)
        rent_url = ('https://ks.anjuke.com/v3/ajax/communityext/?commid=' +
                    str(comm_id.group(1)) + '&useflg=onlyForAjax')
        print rent_url
        response = requests.get(rent_url, headers=self.headers, verify=False, timeout=2)
        print response.text
        content = json.loads(response.text)
        print content.get('comm_propnum').get('rentNum'), content.get('comm_propnum').get('saleNum')
        rlist.append(content.get('comm_propnum').get('rentNum'))
        rlist.append(content.get('comm_propnum').get('saleNum'))
        # self.item_queue.put(rlist)
        return rlist
        # item['property-type'] = value_list[0].string.strip().replace(":", '')
        # item['property-cost'] = value_list[1].string.strip().replace(":", '')
        # item['area'] = value_list[2].string.strip().replace(":", '')
        # item['households'] = value_list[3].string.strip().replace(":", '')
        # item['build-years'] = value_list[4].string.strip().replace(":", '')
        # item['parking-nums'] = value_list[5].string.strip().replace(":", '')
        # item['cap-rate'] = value_list[6].string.strip().replace(":", '')
        # item['greeening-rate'] = value_list[7].string.strip().replace(":", '')
        # item['developer'] = value_list[8].string.strip().replace(":", '')
        # item['property-management'] = value_list[9].string.strip().replace(":", '')
        # print j.string.strip().replace(":", '')
        # for (k, v) in item.items():
        #     print "dict[%s]=" % k, v

    def write_to_csv(self, item):
        csv_write = unicodecsv.writer(self.csvfile, encoding='utf-8-sig', dialect='excel')
        csv_write.writerow(item)

    def write_to_csv2(self):
        if not self.item_queue.empty():
            self.mylock.acquire(10)
            with open('ks.csv', 'a') as csvfile:
                item = self.item_queue.get()
                csv_write = unicodecsv.writer(csvfile, encoding='utf-8-sig', dialect='excel')
                csv_write.writerows(item)
            self.mylock.release()

    def write_to_csv3(self):
        while not self.item_queue.empty():
            item = self.item_queue.get()
            csv_write = unicodecsv.writer(self.csvfile, encoding='utf-8-sig', dialect='excel')
            self.wcount += 1
            print 'write No.', self.wcount, 'url'
            csv_write.writerow(item)

    def start2(self):
        num = 0
        self.get_villages()
        print 'current:urls num1:', self.urlmanager.new_urls_size()
        while self.urlmanager.has_new_url():
            num += 1
            new_url = self.urlmanager.get_new_url()
            try:
                print 'get No.', num, 'url'
                # get_detail only takes the URL; for the queue drain below to see
                # results, get_detail would also need to put rlist on self.item_queue
                url_process = Process(target=self.get_detail, args=(new_url,))
                url_process.start()
            except:
                with open("anjuke.log", 'w+') as f:
                    f.write('current:urls num2:')
                    f.write(str(self.urlmanager.new_urls_size()))
                self.urlmanager.readd_new_url(new_url)
                self.urlmanager.save_urls_process_status(self.urlmanager.new_urls, r'new_urls.txt')
                self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls, r'crawled_urls.txt')
        while not self.item_queue.empty():
            item = self.item_queue.get()
            print 'write No.', num, 'url'
            self.write_to_csv(item)
        # write_process = Process(target=self.write_to_csv2)
        # write_process.start()

    def start3(self):
        # self.get_villages()
        fo = open("anjuke.log", 'w+')
        s = sys.stdout
        sys.stdout = fo
        print 'current:urls num1:', self.urlmanager.new_urls_size()
        while self.urlmanager.has_new_url():
            # Take up to 100 URLs per batch and crawl them with a thread pool
            newlist = []
            flag = 100
            while flag:
                if self.urlmanager.has_new_url():
                    new_url = self.urlmanager.get_new_url()
                    newlist.append(new_url)
                    flag -= 1
                else:
                    break
            pool = threadpool.ThreadPool(12)
            work_requests = threadpool.makeRequests(self.get_detail, newlist)
            [pool.putRequest(req) for req in work_requests]
            pool.wait()
            self.urlmanager.save_urls_process_status(self.urlmanager.new_urls, r'new_urls.txt')
            self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls, r'crawled_urls.txt')
            self.write_to_csv3()
        sys.stdout = s
    def start(self):
        fo = open("anjuke.log", 'w+')
        s = sys.stdout
        sys.stdout = fo
        num = 0
        # self.get_villages()
        print 'current:urls num1:', self.urlmanager.new_urls_size()
        while self.urlmanager.has_new_url():
            new_url = self.urlmanager.get_new_url()
            self.urlmanager.save_urls_process_status(self.urlmanager.new_urls, r'new_urls.txt')
            self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls, r'crawled_urls.txt')
            try:
                item = self.get_detail(new_url)
                num += 1
                print 'write No.', num, 'url'
                self.write_to_csv(item)
            except:
                print 'current:urls num2:', self.urlmanager.new_urls_size()
                self.urlmanager.readd_new_url(new_url)
                self.urlmanager.save_urls_process_status(self.urlmanager.new_urls, r'new_urls.txt')
                self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls, r'crawled_urls.txt')
        sys.stdout = s
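# A possible way to run the Anjuke crawler above (an illustrative assumption,
# not part of the original): collect community detail URLs first, then fetch
# the details single-threaded via start(), appending rows to sz.csv.
if __name__ == '__main__':
    spider = Anjuke()
    spider.get_villages()  # fill the URL manager with community detail URLs
    spider.start()         # fetch details and write them to the CSV file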