def get_proxy_generator(filename):
    if not filename:
        # Scrape fresh IPs from the website and save them
        proxy.get_proxies()
        # Read the saved results back and yield them as a generator
        with open('proxies.txt', 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()
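# Illustrative only: one way the generator above might be consumed, assuming a
# hypothetical caller that turns each "host:port" line into a requests-style
# proxy mapping. iter_proxy_dicts is not part of the original snippet.
def iter_proxy_dicts(filename=None):
    for address in get_proxy_generator(filename):
        if address:  # skip blank lines
            yield {'http': 'http://' + address, 'https': 'http://' + address}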
def scrape_endpoint(endpoint):
    timeout = 5
    counter = 0
    proxies = proxy.get_proxies()
    URL = settings.PRODUCTION_URL + "/" + endpoint
    CLEAR_URL = settings.PRODUCTION_URL + "/clear_address"
    print("Scraping the %s endpoint with ip rotation: %s" % (endpoint, URL))
    try:
        while True:
            print("Request", counter)
            curr_proxy = proxy.create_proxy_dict(random.choice(proxies))
            print("Proxy: ", curr_proxy)
            # Pass the timeout so the requests.Timeout handler below can fire
            response = requests.get(URL, proxies=curr_proxy, timeout=timeout)
            status = json.loads(response.content).get("status", 200)
            if status == 429:
                raise ValueError("Rate Limited after %s requests" % counter)
            if status == 403:
                raise ValueError("Blacklisted after %s requests" % counter)
            counter += 1
    except requests.Timeout:
        print("Timed out after", counter, "requests")
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
    requests.get(CLEAR_URL)
def __init__(self, url, proxy_enabled=0, thread_no=1):
    self.url = url
    self.categories = []
    self.proxies = {}
    self.thread_no = thread_no  # DO NOT set it as 0
    self.db_lock = Lock()
    if proxy_enabled:
        self.proxies = proxy.get_proxies()
def get_soups_helper(self, item_list):
    proxies = get_proxies()
    for item in item_list:
        status = item.fetch_soup(proxies=proxies)
        if status is None:
            self.item_list.remove(item)
            continue
        item.extract_info()
        with self.db_lock:
            ItemDb.create(url=item.url, name=item.name, price=item.price)
def write_new_proxies():
    # Write the extracted proxies to the file
    try:
        proxies = get_proxies()
        with open('proxy_list.txt', 'w') as f:
            for proxy in proxies:
                f.write(proxy + '\n')
        print("DONE")
    except Exception:
        print("MAJOR ERROR")
def check(self):
    """
    Check whether the number of proxy IPs has fallen below the threshold.
    If it has, run the add task to replenish the pool.
    :return:
    """
    if self.size() <= THRESHOLD:
        # Below the threshold: fetch and add new proxies
        results = get_proxies()
        for proxy in results:
            # proxy = result['ip'] + ':' + result['port']
            self.add(proxy)
    else:
        print("There are still at least three proxies available")
def spider(page):
    data = {
        "bt": "",
        "fydw": "",
        "pageNum": page,
    }
    for _ in range(5):
        try:
            response = requests.post(url, headers=headers, data=data, proxies=get_proxies())
            json_data = response.json()
        except (json.JSONDecodeError, adapters.SSLError):
            continue
        else:
            break
    else:
        return {}
    return json_data
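# A minimal sketch of the for/else retry idiom used in spider() above: the
# loop's else clause runs only when no break occurred, i.e. when every attempt
# failed. fetch_with_retries and its parameters are illustrative, not taken
# from the original code.
def fetch_with_retries(fetch, attempts=5):
    for _ in range(attempts):
        try:
            result = fetch()
        except Exception:
            continue  # failed attempt, try again
        else:
            break  # success, leave the loop
    else:
        return None  # every attempt failed; this else belongs to the for loop
    return result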
def community_infor(url):
    while True:
        try:
            html = requests.get('http://hf.anjuke.com/' + url, headers=headers,
                                proxies=get_proxies(), timeout=10).text
            # The site served a captcha page; retry with another proxy
            if '请输入图片中的验证码' in html:
                continue
            break
        except Exception as e:
            print('[community_infor]%s failed' % (url))
    soup = BeautifulSoup(html, 'lxml').find('div', {'class': 'comm-basic-mod'})
    item = {}
    try:
        detail = soup.find('dl', {'class': 'basic-parms-mod'})
        dts = detail.find_all('dt')
        values = detail.find_all('dd')
        for index in range(len(dts)):
            key = dts[index].get_text().replace('\r', '').replace('\n', '').replace(' ', '').replace(':', '').replace('\xa0', '')
            item[key] = values[index].get_text().replace('\r', '').replace('\n', '').replace(' ', '').replace(':', '')
    except:
        pass
    while True:
        try:
            html = requests.get('http://hf.anjuke.com/ajax/communityext/?commid=%s&useflg=onlyForAjax' % url.split('/')[-2],
                                headers=headers, proxies=get_proxies(), timeout=10).text
            break
        except Exception as e:
            print('[community_infor-json]%s failed' % (url))
    data = json.loads(html)['comm_propnum']
    try:
        item['saleNum'] = data['saleNum']
    except:
        item['saleNum'] = '-'
    try:
        item['rentNum'] = data['rentNum']
    except:
        item['rentNum'] = '-'
    keys = ['saleNum', 'rentNum', '所在版块', '地址', '总建面', '总户数', '建造年代', '容积率', '停车位', '绿化率', '出租率']
    line = ''
    for key in keys:
        try:
            line += str(item[key]) + '|'
        except:
            line += '-|'
    return line
def _get_page_via_proxy(url, retry=3, proxies=None, fpfirst=False):
    '''
    Get the page via the given proxy server.
    '''
    start_time = time()
    for i in range(retry):
        if proxies:
            wait_b4_try(i, factor=3)
            x = proxies
        else:
            x = get_proxies()
        if x:
            # each proxy server is tried once only
            # if anything goes wrong, we'll get another one
            # so it'd better grasp the only chance it'll have
            p = _get_page(url, retry=1, proxies=x, fpfirst=fpfirst)
            if p:
                return p
        else:
            LOG.warning('No valid proxy to get page. Continuing.')
    LOG.warning(
        'All %d attempt(s) to get page via proxy failed in %s. Returning nothing.'
        % (retry, time() - start_time))
    return None
def get_community():
    page = 1
    while True:
        try:
            html = requests.get('http://hf.anjuke.com/community/p%s' % page, headers=headers,
                                proxies=get_proxies(), timeout=10).text
            # The site served a captcha page; retry the same page with a new proxy
            if '请输入图片中的验证码' in html:
                continue
        except:
            continue
        try:
            table = BeautifulSoup(html, 'lxml').find('div', id='list-content').find_all('div', {'class': 'li-itemmod'})
        except:
            break
        if table == []:
            break
        with open('urls.txt', 'a') as f:
            for item in table:
                try:
                    url = item.find('a').get('href')
                    name = item.find('a').get('title')
                except:
                    continue
                try:
                    price = item.find('div', {'class': 'li-side'}).find('strong').get_text()
                except:
                    price = '-'
                f.write(name + '|' + price + '|' + url + '\n')
        print(page)
        page += 1
        if page == 51:
            break
        return self.db.lrem(REDIS_KEY, 0, p)

    def delete_all(self):
        """
        Delete every value stored under the key in one go
        :return:
        """
        return self.db.delete(REDIS_KEY)

    def check_proxy(self, ip, port):
        """
        Check whether a proxy has gone stale
        :param ip:
        :param port:
        :return:
        """
        try:
            Telnet().open(ip, port, timeout=3)
            return True
        except Exception:
            return False


if __name__ == "__main__":
    db = REDISCLIENT()
    dict_ = get_proxies()
    for proxy in dict_:
        # proxy = result['ip'] + ":" + result['port']
        db.add(proxy)
    db.check()
NAME = 0
token = open('token', 'r').read().strip()
bot = Bot(token=token)
updater = Updater(token=token, use_context=True)
dispatcher = updater.dispatcher
j = updater.job_queue
logging.basicConfig(
    filename="log",
    level=logging.ERROR,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
server = Server()
proxies = get_proxies()


def callback_alarm(context: CallbackContext):
    """
    Called at the configured interval to notify users when any item
    in their watchlist changes
    """
    logging.log(logging.ERROR, "Notifying Users")
    for user_id, user in server.users.items():
        updated_items = user.check_prices()
        if updated_items != "":
            context.bot.send_message(chat_id=user_id,
                                     text=updated_items,
                                     parse_mode=ParseMode.MARKDOWN)
        else:
            # DEBUG