import os
import re

import setting  # project-local config module holding the regex patterns
import utils    # project-local helpers (URL building, link cleanup, ...)

# get_url_content and print_log are defined elsewhere in this project.


def fetch_post_image_links(url, post_pages):
    print_log('Start collecting all image links in the post')
    post_content_reg = re.compile(setting.post_content_pattern, re.S)
    link_reg = re.compile(setting.img_link_pattern)
    link_reg2 = re.compile(setting.img_link_with_third_site_pattern)
    img_links = []
    for page in range(1, post_pages + 1):
        print('Current page: %s\r' % page, end='', flush=True)
        curl_page_url = utils.make_url_with_page_num(url, page)
        whole_content = get_url_content(curl_page_url)
        if whole_content:
            # Guard against pages where the post body pattern does not match,
            # which would otherwise raise an IndexError on [0].
            matches = re.findall(post_content_reg, whole_content)
            if not matches:
                continue
            content = matches[0]
            origin_img_links = re.findall(link_reg, content) + re.findall(link_reg2, content)
            img_links += [utils.make_real_img_link(link) for link in origin_img_links]
    print_log('Finished collecting links')
    return sorted(utils.clean_str_list(img_links))
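
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of calling fetch_post_image_links; the thread URL and
# page count below are hypothetical placeholders, not values from this
# project, and the real URL format depends on how
# utils.make_url_with_page_num builds per-page URLs.
#
# links = fetch_post_image_links('http://example.com/thread-12345-1-1.html', 3)
# print('collected %d image links' % len(links))
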
def remove_repeat_img_links(record_file_name, img_links):
    if os.path.exists(record_file_name):
        # Links of images that have already been downloaded.
        with open(record_file_name, 'r') as download_records_file:
            existed_links = utils.clean_str_list(download_records_file.readlines())
        # New links: all collected links minus the already-downloaded ones.
        new_links = sorted(set(img_links) - set(existed_links))
        # Appending the new links to the record file here is deprecated;
        # each link is now recorded right after its image is downloaded.
        # download_records_file.writelines([link + '\n' for link in new_links])
        return new_links
    else:
        # No record file yet, so every collected link is new.
        # download_records_file = open(record_file_name, 'w')
        # download_records_file.writelines([link + '\n' for link in img_links])
        # download_records_file.close()
        return img_links
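
# The deprecated writelines above was replaced by recording each link right
# after its image finishes downloading. A minimal sketch of that pattern,
# assuming a hypothetical download_img(link) helper (not defined in this
# file) that returns True on success:
def download_and_record(record_file_name, new_links):
    with open(record_file_name, 'a') as records:
        for link in new_links:
            # Append the link only once the download succeeded, so an
            # interrupted run never marks undownloaded images as done.
            if download_img(link):
                records.write(link + '\n')
                records.flush()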