def get_toppost100(params):
    """Fetch one page of the top-100 work list and persist image entries.

    Retries forever with a fresh proxy on any failure; returns None once a
    page has been fetched and processed successfully.

    :param params: query-string dict for the toppost100 endpoint
    :return: None; writes "name" + "Θ" + image-url lines to pic_urls_file
    """
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.get(toppost100_url, params=params,
                                headers=toppost100_headers,
                                proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("抓取:" + resp.request.url)
                soup = cpn.get_bs(resp.text)
                ul = soup.find('ul', attrs={
                    'class': 'l-clearfix gridList workImageCards js-workTopList'
                })
                # Fix: a block/captcha page has no such <ul>; previously this
                # raised AttributeError and was swallowed by the broad except.
                # Rotate the proxy and retry explicitly instead.
                if ul is None:
                    continue
                for li in ul.findAll('li'):
                    img_tag = li.find('img', attrs={'class': 'cardImage'})
                    # Fix: skip cards with no image tag instead of raising
                    # TypeError (None['src']) and retrying the whole page.
                    if img_tag is None or not img_tag.get('src'):
                        continue
                    img = img_tag['src'][:-4]  # drop 4-char size suffix from the URL
                    if not img == '':
                        name_tag = li.find('p', attrs={'class': 'fz14 text cut'})
                        name = name_tag.get_text().strip() if name_tag else ''
                        if name == '':
                            # fall back to a timestamp so the entry still has a key
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            # broad catch: any network/parse failure just rotates the proxy
            print(threading.current_thread().name + "~" + str(e))
def get_ajax_data(data):
    """POST the ajax endpoint through a proxy and persist card name/image pairs.

    Retries forever with a fresh proxy on any failure; returns None once a
    response has been fetched and processed successfully.

    :param data: form payload for the ajax endpoint
    :return: None; writes "name" + "Θ" + image-url lines to pic_urls_file
    """
    while True:
        proxy_ip = cpn.get_proxy_ip()
        try:
            resp = requests.post(ajax_url, data=data, headers=ajax_headers,
                                 proxies=proxy_ip, timeout=5)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                for li in soup.findAll('li'):
                    img_tag = li.find('img', attrs={'class': 'cardImage'})
                    # Fix: a <li> without a cardImage img previously raised
                    # TypeError (None['src']), which the broad except turned
                    # into a full-page retry; skip that item instead.
                    if img_tag is None or not img_tag.get('src'):
                        continue
                    img = img_tag['src'][:-4]  # drop 4-char size suffix from the URL
                    if not img == '':
                        name_tag = li.find('p', attrs={'class': 'fz14 text cut'})
                        name = name_tag.get_text().strip() if name_tag else ''
                        if name == '':
                            # fall back to a timestamp so the entry still has a key
                            name = str(int(time.time()))
                        cpn.write_str_data(name + "Θ" + img, pic_urls_file)
                return None
        except Exception as e:
            # broad catch: any network/parse failure just rotates the proxy
            print(threading.current_thread().name + "~" + str(e))
def get_article_url(url):
    """Collect article links from one blog list page.

    Single attempt (no retry loop, unlike the proxy-based fetchers); any
    failure is printed and swallowed.

    :param url: list-page URL to fetch
    :return: None; appends each absolute article URL to articles_file
    """
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp is not None:
            print("解析:" + resp.request.url)
            soup = cpn.get_bs(resp.text)
            div = soup.find('div', attrs={'id': 'article_list'})
            # Fix: pages without an #article_list div (e.g. past the last
            # page) previously raised AttributeError; bail out cleanly.
            if div is None:
                return None
            for span in div.findAll('span', attrs={'class': 'link_title'}):
                cpn.write_str_data(base_url + span.find('a')['href'],
                                   articles_file)
            return None
    except Exception as e:
        print(str(e))
def get_pics(count):
    """Fetch one ajax page of a Tieba post and persist all image URLs.

    Retries forever on any failure; returns None once a page has been
    fetched successfully.

    :param count: page number, sent as the 'pn' query parameter
    :return: None; writes each BDE_Image src to pic_urls_file
    """
    while True:
        params = {'pn': count, 'ajax': '1', 't': int(time.time())}
        try:
            resp = requests.get(tiezi_url, headers=headers, timeout=5,
                                params=params)
            if resp is not None:
                soup = cpn.get_bs(resp.text)
                for img in soup.findAll('img', attrs={'class': 'BDE_Image'}):
                    cpn.write_str_data(img['src'], pic_urls_file)
                return None
        except Exception as e:
            # Fix: was `pass` — silently swallowing every error inside an
            # infinite retry loop made hangs undiagnosable. Log then retry,
            # matching the print-based logging used elsewhere in this file.
            print(str(e))
def get_pics(count):
    """Grab every BDE_Image URL on ajax page *count* of the post and store it.

    Loops until one request round-trip succeeds, silently ignoring any
    failure along the way.
    NOTE(review): this re-definition shadows an identical get_pics defined
    earlier in the file — likely a copy/paste leftover; confirm which one
    should survive.

    :param count: page number, sent as the 'pn' query parameter
    :return: None
    """
    while True:
        query = {'pn': count, 'ajax': '1', 't': int(time.time())}
        try:
            response = requests.get(tiezi_url, headers=headers,
                                    timeout=5, params=query)
            if response is None:
                continue
            page = cpn.get_bs(response.text)
            for tag in page.findAll('img', attrs={'class': 'BDE_Image'}):
                cpn.write_str_data(tag['src'], pic_urls_file)
            return None
        except Exception:
            # best-effort: any failure simply triggers another attempt
            pass
def get_ajax_data(data):
    """POST the ajax endpoint via a rotating proxy and store card entries.

    Keeps retrying with a fresh proxy until one attempt completes, writing a
    "title" + "Θ" + image-url line per card, then returns None.
    NOTE(review): duplicate of the get_ajax_data defined earlier in this
    file — confirm which definition is intended to win.

    :param data: form payload for the ajax endpoint
    :return: None
    """
    while True:
        proxy = cpn.get_proxy_ip()
        try:
            response = requests.post(ajax_url, data=data,
                                     headers=ajax_headers,
                                     proxies=proxy, timeout=5)
            if response is not None:
                page = cpn.get_bs(response.text)
                for item in page.findAll('li'):
                    # trailing 4 chars are a size suffix on the image URL
                    image = item.find('img',
                                      attrs={'class': 'cardImage'})['src'][:-4]
                    if image == '':
                        continue
                    title = item.find(
                        'p', attrs={'class': 'fz14 text cut'}
                    ).get_text().strip()
                    if title == '':
                        # no title on the card: substitute a timestamp
                        title = str(int(time.time()))
                    cpn.write_str_data(title + "Θ" + image, pic_urls_file)
                return None
        except Exception as err:
            print(threading.current_thread().name + "~" + str(err))
def get_toppost100(params):
    """Scrape the top-100 work list page via a rotating proxy.

    Keeps retrying with a fresh proxy until one attempt completes, writing a
    "title" + "Θ" + image-url line per card, then returns None.
    NOTE(review): duplicate of the get_toppost100 defined earlier in this
    file — confirm which definition is intended to win.

    :param params: query-string dict for the toppost100 endpoint
    :return: None
    """
    while True:
        proxy = cpn.get_proxy_ip()
        try:
            response = requests.get(toppost100_url, params=params,
                                    headers=toppost100_headers,
                                    proxies=proxy, timeout=5)
            if response is not None:
                print("抓取:" + response.request.url)
                page = cpn.get_bs(response.text)
                card_list = page.find('ul', attrs={
                    'class': 'l-clearfix gridList workImageCards js-workTopList'
                })
                for item in card_list.findAll('li'):
                    # trailing 4 chars are a size suffix on the image URL
                    image = item.find('img',
                                      attrs={'class': 'cardImage'})['src'][:-4]
                    if image == '':
                        continue
                    title = item.find(
                        'p', attrs={'class': 'fz14 text cut'}
                    ).get_text().strip()
                    if title == '':
                        # no title on the card: substitute a timestamp
                        title = str(int(time.time()))
                    cpn.write_str_data(title + "Θ" + image, pic_urls_file)
                return None
        except Exception as err:
            print(threading.current_thread().name + "~" + str(err))