def catch_pic_diagrams(url, tag):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        pic_soup = coderpig.get_bs(coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        pic_url = pic_div.find('img')['data-original']
        proxy_ip = t.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)
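Every example here leans on a small coderpig helper module that is not reproduced on this page. A minimal sketch of the assumed helpers (get_resp, get_bs, is_dir_existed, download_pic, write_str_data), built on urllib and BeautifulSoup, might look like this:

import os
import urllib.request
from bs4 import BeautifulSoup


def get_resp(url, headers=None, proxy=None, read=True):
    # Fetch a URL and return the raw bytes, or the response object when read=False
    request = urllib.request.Request(url, headers=headers or {})
    if proxy is not None:
        # proxy is assumed to be an "ip:port" string
        request.set_proxy(proxy, 'http')
    try:
        resp = urllib.request.urlopen(request)
        return resp.read() if read else resp
    except Exception as reason:
        print(str(reason))
        return None


def get_bs(markup):
    # Wrap the fetched markup in a BeautifulSoup object
    return BeautifulSoup(markup, 'html.parser')


def is_dir_existed(path):
    # Create the directory if it does not exist yet
    os.makedirs(path, exist_ok=True)


def download_pic(url, save_path, proxy=None):
    # Save a single picture into save_path, keeping the original file name
    data = get_resp(url, proxy=proxy)
    if data is not None:
        with open(save_path + url.split('/')[-1], 'wb') as f:
            f.write(data)


def write_str_data(content, file_path):
    # Append one line of text to a file
    with open(file_path, 'a+', encoding='utf-8') as f:
        f.write(content + '\n')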
Example No. 2
def catch_pic_diagrams(url, tag):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    title = soup.find('div', attrs={'class': 'ptitle'}).h1.get_text()
    pic_path = pic_save_path + tag + '/' + title + '/'
    coderpig.is_dir_existed(pic_path)
    ul = soup.find('ul', attrs={'class': 'scroll-img scroll-img02 clearfix'})
    lis = ul.findAll('li')
    for li in lis:
        pic_soup = coderpig.get_bs(
            coderpig.get_resp(li.a['href']).decode('utf-8'))
        pic_div = pic_soup.find('div', attrs={'id': 'pic-meinv'})
        pic_url = pic_div.find('img')['data-original']
        proxy_ip = t.get_proxy_ip()
        coderpig.download_pic(pic_url, pic_path, proxy=proxy_ip)
Example No. 3
def catch_pic_diagrams(url):
    resp = coderpig.get_resp(url).decode('utf-8')
    soup = coderpig.get_bs(resp)
    dir_name = soup.find('title').get_text()[:-5]
    save_path = pic_save_path + dir_name + '/'
    coderpig.is_dir_existed(save_path)
    # Work out the total page count from the '末页' (last page) link
    page_count = int(moye_pattern.match(soup.find('a', text='末页')['href']).group(1))
    for page in range(1, page_count + 1):
        page_resp = coderpig.get_resp(url.replace('.html', '_' + str(page) + '.html')).decode('utf-8')
        page_soup = coderpig.get_bs(page_resp)
        # Grab the images on this page
        imgs = page_soup.find('p', attrs={'align': 'center'}).findAll('img')
        for img in imgs:
            coderpig.download_pic(img['src'], save_path)
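The moye_pattern regex is defined elsewhere in the original script; assuming the '末页' link points at something like 'xxxx_12.html', a plausible definition would be:

import re

# Hypothetical: capture the trailing page number from a link such as 'list_12.html'
moye_pattern = re.compile(r'.*_(\d+)\.html')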
Example No. 4
def catch_pic_diagrams(url):
    resp = coderpig.get_resp(url).decode('utf-8')
    soup = coderpig.get_bs(resp)
    dir_name = soup.find('title').get_text()[:-5]
    save_path = pic_save_path + dir_name + '/'
    coderpig.is_dir_existed(save_path)
    # Work out the total page count from the '末页' (last page) link
    page_count = int(moye_pattern.match(soup.find('a', text='末页')['href']).group(1))
    for page in range(1, page_count + 1):
        page_resp = coderpig.get_resp(url.replace('.html', '_' + str(page) + '.html')).decode('utf-8')
        page_soup = coderpig.get_bs(page_resp)
        # Grab the images on this page
        imgs = page_soup.find('p', attrs={'align': 'center'}).findAll('img')
        for img in imgs:
            coderpig.download_pic(img['src'], save_path)
Example No. 5
def read_article_url(url):
    proxy_ip = coderpig.get_proxy_ip()
    resp = coderpig.get_resp(url, read=False, headers=headers, proxy=proxy_ip)
    if (resp is not None) and (resp.getcode() == 200):
        global read_count
        read_count += 1
        print("累计访问成功次数: %d" % read_count)
Example No. 6
def catch_pic_diagrams_url(url):
    url_list = []
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    articles = soup.findAll('article', attrs={'class': 'excerpt'})
    for article in articles:
        url_list.append(article.a['href'])
    return url_list
Example No. 7
def get_page_count():
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(
        coderpig.get_resp(content_url, headers=headers,
                          proxy=proxy_ip).decode('utf-8'))
    div = soup.find('div', attrs={'id': 'papelist'})
    page_count = (div.findAll('a')[-1]['href']).split('/')[-1]
    return page_count
Example No. 8
def get_city_list_url():
    city_list_url = []
    weather_hb_soup = coderpig.get_bs(coderpig.get_resp(weather_hb_url).decode('utf-8'))
    weather_box = weather_hb_soup.find(attrs={'class': 'lqcontentBoxheader'})
    weather_a_list = weather_box.findAll('a')
    for i in weather_a_list:
        city_list_url.append(weather_base_url + i['href'])
    return city_list_url
Example No. 9
def get_pic_set(url):
    url_list = []
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, proxy=proxy_ip))
    divs = soup.findAll('div', attrs={'class': 'tab_tj'})
    a_s = divs[1].findAll('a')
    for a in a_s:
        url_list.append(a['href'])
    return url_list
Example No. 10
def get_pic_set_page(url):
    url_list = []
    proxy_ip = t.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, proxy=proxy_ip))
    divs = soup.find('div', attrs={'class': 'pages'})
    a_s = divs.findAll('a', attrs={'class': 'num'})
    for a in a_s:
        url_list.append(a['href'])
    return url_list
Example No. 11
def catch_pic_diagrams_url(url):
    url_list = []
    soup = coderpig.get_bs(coderpig.get_resp(url))
    div = soup.find('div', attrs={'class': 'taotu-main'})
    lis = div.findAll('li')
    for li in lis:
        if 'longword' not in (li.get('class') or []):
            url_list.append((base_url + li.find('a')['href']))
    return url_list
Example No. 12
def get_pic_set_page(url):
    url_list = []
    proxy_ip = t.get_proxy_ip()
    soup = coderpig.get_bs(coderpig.get_resp(url, proxy=proxy_ip))
    divs = soup.find('div', attrs={'class': 'pages'})
    a_s = divs.findAll('a', attrs={'class': 'num'})
    for a in a_s:
        url_list.append(a['href'])
    return url_list
Example No. 13
def catch_pic_diagrams_url(url):
    url_list = []
    soup = coderpig.get_bs(coderpig.get_resp(url))
    div = soup.find('div', attrs={'class': 'taotu-main'})
    lis = div.findAll('li')
    for li in lis:
        if 'longword' not in (li.get('class') or []):
            url_list.append((base_url + li.find('a')['href']))
    return url_list
Example No. 14
def get_tag_url():
    print("================================================== 检测有效的tag页:\n")
    for i in range(2, 101):
        proxy_ip = t.get_proxy_ip()
        tag_url = host_url + '/meinvtag' + str(i) + '_1.html'
        resp = coderpig.get_resp(tag_url, proxy=proxy_ip, read=False)
        if resp is not None:
            if resp.getcode() == 200:
                soup = coderpig.get_bs(resp.read())
                coderpig.write_str_data(soup.find('h2').get_text() + "-" + tag_url, tag_url_file)
Example No. 15
def get_article_url(url):
    proxy_ip = coderpig.get_proxy_ip()
    soup = coderpig.get_bs(
        coderpig.get_resp(url, headers=headers,
                          proxy=proxy_ip).decode('utf-8'))
    div = soup.find('div', attrs={'class': 'list_item_new'})
    spans = div.findAll('span', attrs={'class': 'link_title'})
    for span in spans:
        coderpig.write_str_data(base_url + span.find('a')['href'],
                                articles_file)
def cat_code_list():
    result_list = []
    soup = coderpig.get_bs(coderpig.get_resp(base_url))
    areacode = soup.find('areacode').get_text()
    city_list = areacode.split("\n")
    for i in city_list[2:]:
        result = city_pattern.match(i)
        if result is not None:
            result_list.append(result.group(1) + ":" + result.group(2))
    return result_list
Example No. 17
def catch_all_boards(user_url):
    proxy_ip = t.get_proxy_ip()
    resp = coderpig.get_resp(user_url, proxy=proxy_ip).decode('utf-8')
    result = boards_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    for item in json_dict:
        coderpig.write_str_data(item['title'] + ':' + str(item['board_id']), board_ids_file)
    # Return the last board_id
    board_id = json_dict[-1]['board_id']
    return board_id
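boards_pattern is not shown in this snippet; it is assumed to capture a JSON array of boards embedded in the profile page's inline script. A hedged guess at its shape:

import re

# Hypothetical: grab the JSON array that follows "boards": in the page source
boards_pattern = re.compile(r'"boards":(\[.*?\])', re.S)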
Example No. 18
def catch_pic_diagrams(url):
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    # Use the title to create the folder first:
    article_header = soup.find('header', attrs={'class': 'article-header'}).find('a').get_text().replace(':', " ")
    save_path = pic_save_path + article_header + "/"
    coderpig.is_dir_existed(save_path)
    print("开始下载:" + article_header)
    # 拿图片url
    imgs = soup.find('article').findAll('img')
    for img in imgs[:-1]:
        coderpig.download_pic(img['src'].lstrip('/'), save_path)
Example No. 19
def get_tag_url():
    print("================================================== 检测有效的tag页:\n")
    for i in range(2, 101):
        proxy_ip = t.get_proxy_ip()
        tag_url = host_url + '/meinvtag' + str(i) + '_1.html'
        resp = coderpig.get_resp(tag_url, proxy=proxy_ip, read=False)
        if resp is not None:
            if resp.getcode() == 200:
                soup = coderpig.get_bs(resp.read())
                coderpig.write_str_data(
                    soup.find('h2').get_text() + "-" + tag_url, tag_url_file)
Example No. 20
def catch_all_boards(user_url):
    proxy_ip = coderpig.get_proxy_ip()
    resp = coderpig.get_resp(user_url, proxy=proxy_ip).decode('utf-8')
    result = boards_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    for item in json_dict:
        coderpig.write_str_data(item['title'] + ':' + str(item['board_id']),
                                board_ids_file)
    # Return the last board_id
    board_id = json_dict[-1]['board_id']
    return board_id
Example No. 21
def fetch_json(url):
    data = str(coderpig.get_resp(url).decode('utf-8'))
    data = json.loads(data)
    result_list = data['postList']
    for result in result_list:
        save_path = pic_save_path + result['post_id'] + '/'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        pic_list = get_pic_url_list(result['url'])
        for pic in pic_list:
            coderpig.download_pic(pic, save_path)
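get_pic_url_list is another helper that is not reproduced here; a minimal sketch, assuming it simply collects every <img> src on the linked post page, could be:

def get_pic_url_list(url):
    # Hypothetical helper: return the src of every <img> tag on the post page
    soup = coderpig.get_bs(coderpig.get_resp(url).decode('utf-8'))
    return [img['src'] for img in soup.findAll('img')]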
Example No. 22
def get_boards_index_data(url, pic_save_dir):
    print(url)
    proxy_ip = t.get_proxy_ip()
    resp = coderpig.get_resp(url, proxy=proxy_ip).decode('utf-8')
    result = pins_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    if len(json_dict) > 0:
        for item in json_dict:
            coderpig.write_str_data(pic_save_dir + ':' + item['file']['key'], pin_keys_file)
        # Return the last pin_id
        pin_id = json_dict[-1]['pin_id']
        return pin_id
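Like boards_pattern above, pins_pattern lives elsewhere in the script and is assumed to capture an inline JSON array of pins, e.g.:

import re

# Hypothetical: grab the JSON array that follows "pins": in the board page source
pins_pattern = re.compile(r'"pins":(\[.*?\])', re.S)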
Example No. 23
def download_pic(pic_key, pic_dir):
    proxy_ip = t.get_proxy_ip()
    coderpig.is_dir_existed(pic_download_dir)
    url = img_start_url + pic_key + img_end
    resp = coderpig.get_resp(url, proxy=proxy_ip, headers=referrer_header)
    try:
        print("下载图片:" + url)
        pic_name = pic_key + ".jpg"
        with open(pic_dir + pic_name, "wb+") as f:
            f.write(resp)
    except (OSError, urllib.error.HTTPError, urllib.error.URLError, Exception) as reason:
        print(str(reason))
Example No. 24
def download_pic(pic_key, pic_dir):
    proxy_ip = coderpig.get_proxy_ip()
    coderpig.is_dir_existed(pic_download_dir)
    url = img_start_url + pic_key + img_end
    resp = coderpig.get_resp(url, proxy=proxy_ip, headers=referrer_header)
    try:
        print("下载图片:" + url)
        pic_name = pic_key + ".jpg"
        with open(pic_dir + pic_name, "wb+") as f:
            f.write(resp)
    except (OSError, urllib.error.HTTPError, urllib.error.URLError,
            Exception) as reason:
        print(str(reason))
Example No. 25
def get_boards_index_data(url, pic_save_dir):
    print(url)
    proxy_ip = coderpig.get_proxy_ip()
    resp = coderpig.get_resp(url, proxy=proxy_ip).decode('utf-8')
    result = pins_pattern.search(resp)
    json_dict = json.loads(result.group(1))
    if len(json_dict) > 0:
        for item in json_dict:
            coderpig.write_str_data(pic_save_dir + ':' + item['file']['key'],
                                    pin_keys_file)
        # Return the last pin_id
        pin_id = json_dict[-1]['pin_id']
        return pin_id
def download_pic(url):
    correct_url = url
    if url.startswith('//'):
        correct_url = url[2:]
    if not url.startswith('http'):
        correct_url = 'http://' + correct_url
    print(correct_url)
    try:
        pic = coderpig.get_resp(correct_url, headers=headers)
        pic_name = correct_url.split("/")[-1]
        with open(pic_save_path + pic_name, "wb+") as f:
            f.write(pic)
    except (OSError, urllib.error.HTTPError, urllib.error.URLError, Exception) as reason:
        print(str(reason))
Example No. 27
def get_json_list(url, pic_save_dir):
    proxy_ip = t.get_proxy_ip()
    print("获取json:" + url)
    resp = coderpig.get_resp(url, headers=json_headers, proxy=proxy_ip).decode('utf-8')
    if resp is None:
        return None
    else:
        json_dict = json.loads(resp)
        pins = json_dict['board']['pins']
        if len(pins) == 0:
            return None
        else:
            for item in pins:
                coderpig.write_str_data(pic_save_dir + ':' + item['file']['key'], pin_keys_file)
            return pins[-1]['pin_id']
Example No. 28
def catch_json_boards(url):
    proxy_ip = t.get_proxy_ip()
    print("获取画板Json:" + url)
    resp = coderpig.get_resp(url, headers=json_headers, proxy=proxy_ip).decode('utf-8')
    if resp is None:
        return None
    else:
        json_dict = json.loads(resp)
        boards = json_dict['user']['boards']
        if len(boards) == 0:
            return None
        else:
            for item in boards:
                coderpig.write_str_data(item['title'] + ':' + str(item['board_id']), board_ids_file)
            return boards[-1]['board_id']
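Both JSON helpers return the last id on the current page so the caller can cursor through the API. A minimal driver loop, assuming a huaban-style endpoint that accepts a max=<last id> query parameter (the URL format is an assumption), might look like:

def catch_all_board_ids(user_json_url):
    # Keep paging until the API returns an empty board list (sketch)
    last_id = catch_json_boards(user_json_url)
    while last_id is not None:
        last_id = catch_json_boards(user_json_url + '?max=' + str(last_id) + '&limit=10&wfl=1')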
Example No. 29
def get_json_list(url, pic_save_dir):
    proxy_ip = coderpig.get_proxy_ip()
    print("获取json:" + url)
    resp = coderpig.get_resp(url, headers=json_headers,
                             proxy=proxy_ip).decode('utf-8')
    if resp is None:
        return None
    else:
        json_dict = json.loads(resp)
        pins = json_dict['board']['pins']
        if len(pins) == 0:
            return None
        else:
            for item in pins:
                coderpig.write_str_data(
                    pic_save_dir + ':' + item['file']['key'], pin_keys_file)
            return pins[-1]['pin_id']
Example No. 30
def catch_json_boards(url):
    proxy_ip = coderpig.get_proxy_ip()
    print("获取画板Json:" + url)
    resp = coderpig.get_resp(url, headers=json_headers,
                             proxy=proxy_ip).decode('utf-8')
    if resp is None:
        return None
    else:
        json_dict = json.loads(resp)
        boards = json_dict['user']['boards']
        if len(boards) == 0:
            return None
        else:
            for item in boards:
                coderpig.write_str_data(
                    item['title'] + ':' + str(item['board_id']),
                    board_ids_file)
            return boards[-1]['board_id']
Example No. 31
def get_city_code(city_list_url):
    city_code_dict = {}  # Create an empty dict
    city_pattern = re.compile(r'^<a.*?weather/(.*?).s.*</a>$')  # Regex to pull the city code out of the link
    weather_hb_soup = coderpig.get_bs(coderpig.get_resp(city_list_url).decode('utf-8'))
    # Filter out the invalid (hidden) blocks
    div_conMidtab = weather_hb_soup.find_all(attrs={'class': 'conMidtab', 'style': ''})

    for mid in div_conMidtab:
        tab3 = mid.find_all(attrs={'class': 'conMidtab3'})
        for tab in tab3:
            trs = tab.findAll('tr')
            for tr in trs:
                a_list = tr.findAll('a')
                for a in a_list:
                    if a.get_text() != "详情":
                        # Use the regex to extract the city code
                        city_code = city_pattern.match(str(a)).group(1)
                        city_name = a.string
                        city_code_dict[city_code] = city_name
    return city_code_dict
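A short usage sketch tying this together with get_city_list_url from Example No. 8, assuming the weather_hb_url and weather_base_url globals are defined elsewhere:

if __name__ == '__main__':
    all_city_codes = {}
    # Walk every province list page and merge the per-page city code dicts
    for list_url in get_city_list_url():
        codes = get_city_code(list_url)
        if codes:
            all_city_codes.update(codes)
    print(all_city_codes)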
Example No. 32
def fetch_meizi_pic(url):
    data = str(coderpig.get_resp(url).decode('utf-8'))
    data = json.loads(data)
    result_list = data['results']
    for result in result_list:
        coderpig.download_pic(result['url'], pic_save_path)
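A typical call, assuming a Gank-style JSON endpoint whose response carries a 'results' array of objects with a 'url' field (the endpoint below is only an illustration):

if __name__ == '__main__':
    coderpig.is_dir_existed(pic_save_path)
    fetch_meizi_pic('http://gank.io/api/data/福利/10/1')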