def get_pic_set_page(url):
    # Collect the URL of every page in a picture set from the pagination bar.
    url_list = []
    proxy_ip = t.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, proxy=proxy_ip))
    divs = soup.find('div', attrs={'class': 'pages'})
    a_s = divs.findAll('a', attrs={'class': 'num'})
    for a in a_s:
        url_list.append(a['href'])
    return url_list
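# The two modules assumed above: `t` supplies proxies, `coderpig` wraps
# fetching/parsing. A minimal sketch of what they might provide -- for context
# only; the real modules presumably add proxy validation, retries, and caching:
import random
import urllib.request
from bs4 import BeautifulSoup

def get_proxy_ip():
    # Hypothetical: pick a random proxy from a pre-collected pool.
    proxy_pool = ['127.0.0.1:8888']  # placeholder entry
    return random.choice(proxy_pool)

def get_resp(url, proxy=None, headers=None, read=True):
    # Hypothetical: fetch through the proxy; return raw bytes, or the open
    # response object when read=False (so callers can check getcode() first).
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': proxy} if proxy else {}))
    req = urllib.request.Request(url, headers=headers or {})
    resp = opener.open(req, timeout=5)
    return resp.read() if read else resp

def get_bs(markup):
    # Hypothetical: parse markup with BeautifulSoup.
    return BeautifulSoup(markup, 'lxml')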
def catch_all_boards(user_url):
    # Fetch a user's profile page and record every board as "title:board_id".
    proxy_ip = t.get_proxy_ip()
    resp = coderpig.get_resp(user_url, proxy=proxy_ip).decode('utf-8')
    result = boards_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    for item in json_dict:
        coderpig.write_str_data(item['title'] + ':' + str(item['board_id']), board_ids_file)
    # Return the last board_id (used as the cursor for the next page)
    board_id = json_dict[-1]['board_id']
    return board_id
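# `boards_pattern` and `board_ids_file` are assumed module globals: a
# precompiled regex that digs the boards array out of inline JavaScript on the
# profile page, and the file the "title:board_id" lines are appended to.
# A plausible shape (the real page markup may differ):
import re
boards_pattern = re.compile(r'"boards":(\[.*?\])', re.S)
board_ids_file = 'board_ids.txt'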
def get_tag_url():
    print("================================================== Probing for valid tag pages:\n")
    for i in range(2, 101):
        proxy_ip = t.get_proxy_ip()
        tag_url = host_url + '/meinvtag' + str(i) + '_1.html'
        resp = coderpig.get_resp(tag_url, proxy=proxy_ip, read=False)
        if resp is not None and resp.getcode() == 200:
            soup = coderpig.get_bs(resp.read())
            coderpig.write_str_data(soup.find('h2').get_text() + "-" + tag_url, tag_url_file)
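# Assumed globals for the probe above: `host_url` (site root) and
# `tag_url_file` (where the "title-url" lines land). Hypothetical values and a
# driver that runs the probe and echoes what was collected:
host_url = 'http://www.example.com'  # placeholder root
tag_url_file = 'tag_urls.txt'

def run_tag_probe():
    get_tag_url()
    with open(tag_url_file, encoding='utf-8') as f:
        for line in f:
            print(line.strip())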
def read_article_url(url):
    # Retry with a fresh proxy until the article responds with HTTP 200.
    while True:
        proxy_ip = tools.get_proxy_ip()
        try:
            resp = requests.get(url, headers=headers, proxies=proxy_ip, timeout=5)
            if resp is not None and resp.status_code == 200:
                global read_count
                read_count += 1
                print("Cumulative successful visits: %d" % read_count)
                return None
        except Exception:
            pass  # this proxy failed; loop around and try another
def get_boards_index_data(url, pic_save_dir):
    # Scrape the first board page and record each pin as "<save dir>:<file key>".
    print(url)
    proxy_ip = t.get_proxy_ip()
    resp = coderpig.get_resp(url, proxy=proxy_ip).decode('utf-8')
    result = pins_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    if len(json_dict) > 0:
        for item in json_dict:
            coderpig.write_str_data(pic_save_dir + ':' + item['file']['key'], pin_keys_file)
        # Return the last pin_id (used as the cursor for the next page)
        pin_id = json_dict[-1]['pin_id']
        return pin_id
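# pin_id works as a paging cursor: the first board page embeds its pins in the
# HTML, and later slices are assumed to come from a JSON endpoint that accepts
# max=<last pin_id> (see get_json_list further down). Hypothetical flow:
def crawl_board(board_url, pic_save_dir):
    pin_id = get_boards_index_data(board_url, pic_save_dir)
    while pin_id is not None:
        pin_id = get_json_list(board_url + '?max=' + str(pin_id) + '&limit=20', pic_save_dir)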
def read_article_url(url):
    # Variant of the reader above that logs the failure before retrying.
    while True:
        proxy_ip = tools.get_proxy_ip()
        try:
            resp = requests.get(url, headers=headers, proxies=proxy_ip, timeout=5)
            if resp is not None and resp.status_code == 200:
                global read_count
                read_count += 1
                print("Cumulative successful visits: %d" % read_count)
                return None
        except Exception as e:
            print(e)
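# `headers` and `read_count` are module-level state assumed by both readers; a
# plausible setup plus a driver that replays a batch of article URLs:
headers = {'User-Agent': 'Mozilla/5.0'}  # minimal UA; the real script may send more fields
read_count = 0

def read_all(urls):
    for url in urls:
        read_article_url(url)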
def download_pic(pic_key, pic_dir):
    # Rebuild the CDN URL from the stored key and save the image to pic_dir.
    proxy_ip = t.get_proxy_ip()
    coderpig.is_dir_existed(pic_download_dir)
    url = img_start_url + pic_key + img_end
    resp = coderpig.get_resp(url, proxy=proxy_ip, headers=referrer_header)
    try:
        print("Downloading picture: " + url)
        pic_name = pic_key + ".jpg"
        with open(pic_dir + pic_name, "wb+") as f:
            f.write(resp)
    except Exception as reason:
        # Covers OSError and the urllib.error failures alike.
        print(str(reason))
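# `img_start_url`, `img_end`, and `referrer_header` are assumed module
# constants: the CDN URL is rebuilt from the stored key, and a Referer header
# is sent because many image hosts reject bare requests. Placeholder values:
img_start_url = 'http://img.example-cdn.com/'  # hypothetical CDN prefix
img_end = '_fw658'                             # hypothetical size/style suffix
referrer_header = {'Referer': 'http://www.example.com/'}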
def catch_pic_diagrams(url, tag):
    # Parse one picture set: make its save dir, then follow each thumbnail
    # to the full-size image hidden in the lazy-load attribute.
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        pic_soup = coderpig.get_bs(coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        pic_url = pic_div.find('img')['data-original']
        proxy_ip = t.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)
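# A hypothetical driver gluing the pieces together. It assumes each line in
# tag_url_file is "title-url" and, for brevity, that every paginated URL can
# be fed straight to catch_pic_diagrams; the real site may interpose a
# listing page between the tag page and each picture set.
def crawl_tag_file(tag_file):
    with open(tag_file, encoding='utf-8') as f:
        for line in f:
            tag, tag_url = line.strip().split('-', 1)
            for page_url in get_pic_set_page(tag_url):
                catch_pic_diagrams(page_url, tag)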
def download_pic(img_url):
    # Retry with a fresh proxy until the image downloads.
    while True:
        proxy_ip = tools.get_proxy_ip()
        try:
            resp = requests.get(img_url, headers=headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading picture: " + resp.request.url)
                pic_name = img_url.split("/")[-1]
                with open(pic_save_dir + pic_name, "wb+") as f:
                    f.write(resp.content)
                return None
        except Exception:
            pass  # this proxy failed; try the next one
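# The loop above retries forever, which can spin on a permanently dead URL. A
# bounded alternative (not the original author's code) gives up after a fixed
# number of proxy attempts:
def download_pic_with_limit(img_url, max_retries=10):
    for _ in range(max_retries):
        proxy_ip = tools.get_proxy_ip()
        try:
            resp = requests.get(img_url, headers=headers, proxies=proxy_ip, timeout=5)
            if resp is not None:
                with open(pic_save_dir + img_url.split("/")[-1], "wb+") as f:
                    f.write(resp.content)
                return True
        except Exception:
            continue
    return False  # every proxy attempt failed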
def catch_json_boards(url):
    proxy_ip = t.get_proxy_ip()
    print("Fetching boards JSON: " + url)
    resp = coderpig.get_resp(url, headers=json_headers, proxy=proxy_ip)
    if resp is None:
        return None
    json_dict = json.loads(resp.decode('utf-8'))
    boards = json_dict['user']['boards']
    if len(boards) == 0:
        return None
    for item in boards:
        coderpig.write_str_data(item['title'] + ':' + str(item['board_id']), board_ids_file)
    return boards[-1]['board_id']
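# `json_headers` is assumed to mark the call as AJAX so the server returns
# JSON rather than the HTML shell; a typical guess:
json_headers = {'X-Requested-With': 'XMLHttpRequest', 'Accept': 'application/json'}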
def get_json_list(url, pic_save_dir):
    proxy_ip = t.get_proxy_ip()
    print("Fetching pins JSON: " + url)
    resp = coderpig.get_resp(url, headers=json_headers, proxy=proxy_ip)
    if resp is None:
        return None
    json_dict = json.loads(resp.decode('utf-8'))
    pins = json_dict['board']['pins']
    if len(pins) == 0:
        return None
    for item in pins:
        coderpig.write_str_data(pic_save_dir + ':' + item['file']['key'], pin_keys_file)
    return pins[-1]['pin_id']
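# The two JSON readers compose into a user-level crawl; the max/limit query
# parameters are an assumption about the API's cursor scheme:
def crawl_user_boards(user_url):
    board_id = catch_json_boards(user_url)
    while board_id is not None:
        board_id = catch_json_boards(user_url + '?max=' + str(board_id) + '&limit=10')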
def download_pic(pic_data):
    # pic_data is a "category~url" record; save the image under its category dir.
    split = pic_data.split("~")
    pic_dir = c.ZZS_FLS_MZT_SAVE_PATH + split[0] + "/"
    pic_url = split[1]
    t.is_dir_existed(pic_dir)
    while True:
        proxy_ip = t.get_proxy_ip()
        print(proxy_ip)
        try:
            resp = requests.get(pic_url, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading picture: " + resp.request.url)
                pic_name = pic_url.split("/")[-1]
                with open(pic_dir + pic_name, "wb+") as f:
                    f.write(resp.content)
                return None
        except Exception as e:
            print(e)
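# The "category~url" records consumed above are assumed to be written by the
# listing crawler; a hypothetical writer for the same format:
def save_pic_record(category, img_url, record_file='pic_records.txt'):
    with open(record_file, 'a', encoding='utf-8') as f:
        f.write(category + '~' + img_url + '\n')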
def download_pic(pic_data):
    # Same "category~url" downloader, with the save root in a module variable.
    split = pic_data.split("~")
    pic_dir = pic_save_path + split[0] + "/"
    pic_url = split[1]
    t.is_dir_existed(pic_dir)
    while True:
        proxy_ip = t.get_proxy_ip()
        print(proxy_ip)
        try:
            resp = requests.get(pic_url, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading picture: " + resp.request.url)
                pic_name = pic_url.split("/")[-1]
                with open(pic_dir + pic_name, "wb+") as f:
                    f.write(resp.content)
                return None
        except Exception as e:
            print(e)
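# Replaying the record file through a small thread pool is a natural fit for
# either downloader; `pic_data_file` is a hypothetical name:
from concurrent.futures import ThreadPoolExecutor

def download_all(pic_data_file):
    with open(pic_data_file, encoding='utf-8') as f:
        records = [line.strip() for line in f if line.strip()]
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(download_pic, records)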