Example #1
def parse_blogs_info(blogs_urls):
    global pre_page_last_img_info
    imgs_info = []
    blog_num = 0
    blen = len(blogs_urls)
    # Parse each blog URL in turn; blen tracks how many remain
    for blog_url in blogs_urls:
        print("博客 %s 开始解析" % (blog_url))
        content = requests.get(blog_url, headers=useragentutil.get_headers()).content.decode("utf-8")
        author_page_parse = etree.HTML(requests.get(blog_url.split("/post")[0]).content.decode("utf-8"))
        author_name = author_page_parse.xpath("//title/text()")[0].replace("\n", "").replace(" ", "")
        author_ip = re.search(r"http(s)*://(.*).lofter.com/", blog_url).group(2)
        # Get the blog's publication time
        public_time = get_time(blog_url, author_page_parse)

        # Different author homepages have different structures, so instead of XPath a regex matches all image links
        # imgs_url = re.findall('"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)"', content)
        imgs_url = re.findall('"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)"', content)

        # Image filename index within the current blog
        img_index = 0
        blog_num += 1

        # Filter the image links collected from the blog page
        imgs_url = l4_author_img.img_fliter(imgs_url, "img")
        print(imgs_url)

        # Assemble the image info for the saving step
        for img_url in imgs_url:
            img_index += 1
            # Determine the image type
            is_gif = re.findall("gif", img_url)
            is_png = re.findall("png", img_url)
            if is_gif:
                img_type = "gif"
            elif is_png:
                img_type = "png"
            else:
                img_type = "jpg"
            img_info = {}
            img_info["img_url"] = img_url
            author_name_in_filename = author_name.replace("/", "&").replace("|", "&").replace("\\", "&"). \
                replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?"). \
                replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")")
            img_info["pic_name"] = author_name_in_filename + "[" + author_ip + "] " + public_time + "(" + str(
                img_index) + ")." + img_type
            imgs_info.append(img_info)
            # The idea was to save a second jpg copy whenever the image is a png, since pngs with
            # transparency sometimes only display fully as jpg, but the result was poor, so it is disabled
            # if img_type == "png":
            #     img_info_2={}
            #     img_info_2["img_url"] = img_url
            #     img_info_2["pic_name"] = author_name + "[" + author_ip + "] " + public_time + "(" + str(
            #         img_index) + ").jpg"
            #     imgs_info.append(img_info_2)

        blen -= 1
        print("解析完成,获取到图片链接%d,总获取图片数%d,已解析完成%d个链接,剩余%d" % (len(imgs_url), len(imgs_info), blog_num, blen))
        print()
    return imgs_info
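The author-name cleanup above repeats a chain of replace() calls that several later examples reuse for filenames; a minimal reusable sketch of the same idea (the helper name sanitize_filename is illustrative, not part of the original code):

def sanitize_filename(name):
    """Replace characters that are illegal in Windows filenames with safe or full-width equivalents."""
    table = {"/": "&", "|": "&", "\\": "&", "<": "《", ">": "》", ":": ":",
             '"': "”", "?": "?", "*": "·", "(": "(", ")": ")", "\n": ""}
    for bad, good in table.items():
        name = name.replace(bad, good)
    return name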
Example #2
def parse_proxy_url(temp_url):
    '''Request the proxy-listing page and return its HTML source'''
    request = urllib.request.Request(temp_url,
                                     headers=useragentutil.get_headers())
    # Send the request and read the response
    rsp = urllib.request.urlopen(request).read().decode()
    return rsp
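A minimal usage sketch, parsing the returned HTML with lxml the way the other examples do (the URL and XPath below are placeholders, not from the original code):

from lxml import etree

html = parse_proxy_url("https://www.example.com/proxy-list/1")  # placeholder proxy-listing URL
parser = etree.HTML(html)
rows = parser.xpath("//table//tr")  # illustrative XPath, adjust to the real page structure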
Example #3
def save_long_article(long_articles_info, file_path, save_img_in_text):
    if not os.path.exists(file_path + "/long article"):
        os.makedirs(file_path + "/long article")
    count = 0
    is_tag_null = lambda x: x if x != "" else "无"
    for l_info in long_articles_info:
        # Assemble the document text
        l_head = l_info["title"] + " by " + l_info["author name"] + "[" + l_info["author ip"] + "]" + "\n发表时间:" \
                 + l_info["public time"] + "\n原文连接:" + l_info["url"] + "\ntags:" + \
                 is_tag_null(", ".join(l_info["tags"]))
        l_tail = ""
        if l_info["long article url"]:
            l_tail += "文章中包含的外部连接"
            for external_links in l_info["long article url"]:
                l_tail += "\n" + external_links
        if l_info["long article img"]:
            l_tail += "\n\n文章中包含的图片连接:"
            for illustration in l_info["long article img"]:
                l_tail += "\n" + illustration
        long_article = l_head + "\n\n\n" + l_info[
            "long article content"] + "\n\n\n" + l_tail
        filename = l_info["title in filename"] + " by " + l_info[
            "author name in filename"] + ".txt"

        count += 1
        # Progress output
        if count % 5 == 0 or count == len(long_articles_info) or count == 1:
            print("保存进度 {}/{}\t\t{}".format(count, len(long_articles_info),
                                            filename),
                  end="\t\t")
        # Save the text file
        write_text(long_article, filename, file_path + "/long article")
        # Save images embedded in the article
        if save_img_in_text:
            if l_info["long article img"]:
                for img_url in l_info["long article img"]:
                    # re_url = re.findall('http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*', img_url)
                    re_url = re.findall(
                        'http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*',
                        img_url)
                    if not re_url:
                        print("\n图片 {} 不是lofter站内图 可能会保存失败".format(img_url),
                              end="\t")
                    try:
                        img = requests.get(
                            img_url,
                            headers=useragentutil.get_headers()).content
                    except:
                        print("保存失败,请尝试手动保存", end="\t")
                        continue

                    img_name = l_info["title in filename"] + " by " + l_info[
                        "author name in filename"] + ".jpg"
                    img_name = filename_check(img_name, img,
                                              file_path + "/long article",
                                              "jpg")
                    write_img(img, img_name, file_path + "/long article")

        if count % 5 == 0 or count == len(long_articles_info) or count == 1:
            print("保存完成")
Example #4
def main():
    """下载皮肤图片"""
    # Read the json file
    hero_lists = read_hero_skin_by_json()
    # Iterate over the heroes
    for hero_element in hero_lists:
        hero_name = hero_element["hero_name"]
        hero_skins = hero_element["hero_skin_list"]

        # [{'skin_name': '流云之翼', 'skin_url ': 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/506/506-bigskin-1.jpg'},

        headers = useragentutil.get_headers()
        for skins_element in hero_skins:
            # Skin name
            skin_name = skins_element["skin_name"]
            # Skin url
            skin_url = skins_element["skin_url"]
            # Download the image
            response = requests.get(skin_url, headers=headers)
            image_content = response.content

            # Write the image to a file
            with open("./herofile/" + hero_name + "/" + skin_name + ".jpg",
                      "wb") as image_file:
                image_file.write(image_content)
            print("正在下载——>%s-->皮肤%s图片.." % (hero_name, skin_name))
        print("----%s的所有皮肤图片下载成功" % hero_name)
    print("所有英雄图片下载成功")
Example #5
def get_html_datas(url_datas):
    """爬取英雄界面network代码"""
    headers = useragentutil.get_headers()
    # Render the page and grab its content
    driver = webdriver.PhantomJS("./phantomjs-2.1.1-windows/bin/phantomjs.exe")
    driver.get(url_datas)
    html_content = driver.page_source
    driver.quit()
    return html_content
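PhantomJS support has been removed from recent Selenium releases, so the snippet above only runs on older Selenium versions; a sketch of the same idea with headless Chrome swapped in (not the original code, and it assumes chromedriver is installed and on PATH):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_html_datas_headless(url_datas):
    """Render the hero page with headless Chrome and return the page source."""
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH
    driver.get(url_datas)
    html_content = driver.page_source
    driver.quit()
    return html_content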
Example #6
 def parse_offer_url(self, temp_url):
     """爬取整个页面内容"""
     offer_response = requests.get(temp_url,
                                   headers=useragentutil.get_headers(),
                                   proxies=proxypool.get_proxy())
     offer_html_content = offer_response.content.decode("utf-8")
     # Rate-limit handling
     wait_time = random.randint(0, 5)
     print("动态限制访问频率,%ds 后继续爬取数据..." % wait_time)
     time.sleep(wait_time)
     return offer_html_content
Example #7
def save_text(texts_info, file_path, save_img_in_text):
    if not os.path.exists(file_path + "/text"):
        os.makedirs(file_path + "/text")
    count = 0
    is_tag_null = lambda x: x if x != "" else "无"
    for text_info in texts_info:
        count += 1
        # Assemble the document text
        text_head = text_info["author name"] + "[" + text_info["author ip"] + "]\n发表时间:" + text_info["public time"] \
                    + "\n原文连接:" + text_info["url"] + "\ntags:" + is_tag_null(", ".join(text_info["tags"]))
        text_tail = get_tail(text_info)
        text = text_head + "\n\n\n" + text_info[
            "content"] + "\n\n\n" + text_tail

        first_tag = "无tag"
        if text_info["tags"]:
            first_tag = text_info["tags"][0]
        filename = text_info[
            "author name in filename"] + "-" + first_tag + "-" + text_info[
                "public time"] + ".txt"
        filename = filename_check(filename, text, file_path + "/text", "txt")

        # Progress output
        if count % 10 == 0 or count == len(texts_info):
            try:
                print("保存进度 {}/{}\t\t{}".format(count, len(texts_info),
                                                filename),
                      end="\t\t")
            except:
                print("保存进度 {}/{}\t\t{}".format(count, len(texts_info),
                                                "文件名异常"),
                      end="\t\t")

        # Save the text file
        write_text(text, filename, file_path + "/text")
        # Save images embedded in the text

        if save_img_in_text:
            if text_info["illustration"]:
                for img_url in text_info["illustration"]:
                    img_name = text_info[
                        "author name in filename"] + "-" + first_tag + "-" + text_info[
                            "public time"] + ".jpg"

                    # img_name = text_info + " by " + text_info["author name in filename"] + ".jpg"
                    img = requests.get(
                        img_url, headers=useragentutil.get_headers()).content
                    img_name = filename_check(img_name, img,
                                              file_path + "/text", "jpg")
                    write_img(img, img_name, file_path + "/text")
        if count % 10 == 0 or count == len(texts_info):
            print("保存完成")
Example #8
 def catch_work_info(self, temp_url):
     """提取工作职责信息"""
     try:
         work_response = requests.get(temp_url,
                                      headers=useragentutil.get_headers(),
                                      proxies=proxypool.get_proxy())
         work_html_content = work_response.content.decode("gbk")
         work_parser = lxml.html.etree.HTML(work_html_content)
         work_infos = "".join(
             work_parser.xpath("//div[@class='bmsg job_msg inbox']//text()")
         ).strip().replace(" ", "")  # Clean up the data
         #print("工作职责:",work_infos)
     except Exception:
         work_infos = "暂无数据"
     return work_infos
Example #9
 def get_offer_pages(self):
     """动态获取页面数,int"""
     offer_page_response = requests.get(self.offer_index_url,
                                        headers=useragentutil.get_headers(),
                                        proxies=proxypool.get_proxy())
     # Get the page source
     page_html_content = offer_page_response.content.decode("gbk")
     # Parse the data
     metree = lxml.html.etree
     page_parser = metree.HTML(page_html_content)
     # Extract the pager text
     pages_content = page_parser.xpath(
         "//div[@class='dw_page']//span[@class='td']/text()")[0]
     pages = int(re.search(r"共(\d+)页", pages_content)[1])
     return pages
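The page count is pulled out of pager text such as "共12页" with a regex; a self-contained check of that pattern on a made-up sample string:

import re

pages_content = "1 / 12 共12页 到第 页"  # illustrative sample of the pager text, not real data
pages = int(re.search(r"共(\d+)页", pages_content)[1])
print(pages)  # -> 12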
Example #10
def download_hero_image(hero_list):
    """下载头像图片"""
    dir_name = "./herofile"
    for hero_element in hero_list:
        hero_name = hero_element["hero_name"]
        hero_url = hero_element["image_url"]

        headers = useragentutil.get_headers()
        response = requests.get(hero_url, headers=headers)
        image_content = response.content

        hero_path = dir_name + "/" + hero_name + "/1" + hero_name + ".jpg"
        # Write the image to a file
        with open(hero_path, "wb") as hero_file:
            hero_file.write(image_content)
        print("正在下载--(%S)--图片" % hero_name)
    print("所有图片头像已经下载成功")
Example #11
 def catch_company_info(self, temp_url):
     """提取公司简介信息"""
     try:
         company_response = requests.get(
             temp_url,
             headers=useragentutil.get_headers(),
             proxies=proxypool.get_proxy())
         company_html_content = company_response.content.decode("gbk")
         # Extract the data
         company_parser = lxml.html.etree.HTML(company_html_content)
         company_infos = "".join(
             company_parser.xpath(
                 "//div[@class='con_txt']//text()")).strip().replace(
                     " ", "")
     except Exception:
         company_infos = "暂无数据"
     return company_infos
Example #12
def download_img(imgs_info, imgs_info_saved, author_name, author_ip,
                 file_update_interval):
    """
    :param imgs_info: 图片信息
    :param imgs_info_saved: 已完成保存的图片信息
    :param author_name: 作者名
    :param author_ip 作者的lofter三级域名
    :return:无
    """
    author_name_in_filename = author_name.replace("/", "&").replace("|", "&").replace("\\", "&"). \
        replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?"). \
        replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")")
    dir_path = "./dir/img/" + author_name_in_filename + "[" + author_ip + "]"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    save_num = len(imgs_info_saved)
    for img_index in range(len(imgs_info)):
        pic_name = imgs_info[0]["pic_name"]
        pic_name_in_filename = pic_name.replace("/", "&").replace("|", "&").replace("\r", " ").replace(
            "\\", "&").replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")").strip()

        pic_url = imgs_info[0]["img_url"]
        img_path = dir_path + "/" + pic_name_in_filename
        print("获取图片 %s" % (pic_url))
        content = requests.get(pic_url,
                               headers=useragentutil.get_headers()).content
        with open(img_path, "wb") as op:
            op.write(content)

        save_num += 1
        imgs_info_saved.append(imgs_info[0])
        del imgs_info[0]

        print("图片已保存,共保存图片%d (本次运行已保存%d),余%d" %
              (save_num, img_index + 1, len(imgs_info)))

        if img_index % file_update_interval == 0 or len(imgs_info) == 0:
            file_update("./dir/author_img_file/imgs_info.json", imgs_info)
            file_update("./dir/author_img_file/imgs_info_saved.json",
                        imgs_info_saved)
            time.sleep(1)
            print("文件刷新")
    with open("./dir/author_img_file/imgs_info.json", "w") as op:
        op.write("finished")
Example #13
def download_img(imgs_info):
    dir_path = "./dir/img/this"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    num = 0
    list_len = len(imgs_info)
    for img_info in imgs_info:
        pic_name = img_info["pic_name"]
        pic_url = img_info["img_url"]
        img_path = dir_path + "/" + pic_name
        print("获取图片 %s,%s" % (pic_url, pic_name))
        content = requests.get(pic_url, headers=useragentutil.get_headers()).content
        with open(img_path, "wb") as op:
            op.write(content)
        num += 1
        list_len -= 1
        print("图片已保存,共保存图片%d ,余%d" % (num, list_len))

        if num % 8 == 0:
            time.sleep(1)
Example #14
    def catch_work_info(self, temp_url):
        """提取工作职责信息"""
        try:
            work_response = requests.get(temp_url,
                                         headers=useragentutil.get_headers(),
                                         proxies=proxypool.get_proxy())
            work_html_content = work_response.content.decode("utf-8")

            work_parser = lxml.html.etree.HTML(work_html_content)
            work_infos = work_parser.xpath(
                "//div[@class='tabs_box pllist active']//ul[@class='clearfix']/li"
            )
            url = []
            for li in work_infos:
                url.append(li.xpath("./a/@href"))

        except Exception:
            # On failure, return an empty list instead of leaving url unbound
            url = []

        return url
Example #15
def run(author_url, start_time, end_time, target_titles, merger_chapter):
    author_page_parse = etree.HTML(
        requests.get(
            author_url,
            headers=useragentutil.get_headers()).content.decode("utf-8"))
    # id is a numeric parameter needed to request the archive page; ip is the author's lofter subdomain, set at registration
    author_id = author_page_parse.xpath(
        "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
    author_ip = re.search(r"http[s]*://(.*).lofter.com/", author_url).group(1)

    try:
        author_name = author_page_parse.xpath("//title//text()")[0]
    except:
        author_name = input("解析作者名时出现异常,请手动输入\n")
    archive_url = author_url + "dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr"

    query_num = 50
    data = l4_author_img.make_data(author_id, query_num)
    head = l4_author_img.make_head(author_url)

    print("作者名%s,lofter ip %s,主页链接 %s" % (author_name, author_ip, author_url))
    path = "./dir/article"
    arthicle_path = "./dir/article/{}".format(author_name)

    blog_infos = parse_archive_page(archive_url, head, data, author_url,
                                    query_num, start_time, end_time,
                                    target_titles, merger_chapter)
    if not blog_infos:
        print("作者主页中无带标题的博客,无需爬取,程序退出")
        exit()
    for x in [path, arthicle_path]:
        if not os.path.exists(x):
            os.makedirs(x)
    if target_titles and merger_chapter:
        save_chapter(blog_infos, target_titles, author_name, author_ip)

    else:
        save_file(blog_infos, author_name, author_ip)
        # print("end")
    print("运行结束")
Example #16
def main():
    '''Validate the proxy IPs: keep the good ones, drop the bad ones'''
    proxy_ip_datas = get_proxy_fromjson()
    url = "https://www.sohu.com/"
    value_proxy_list = []
    for proxy in proxy_ip_datas:
        try:
            proxy_response = requests.get(url,
                                          headers=useragentutil.get_headers(),
                                          proxies=proxy)
            if proxy_response.status_code == 200:
                value_proxy_list.append(proxy)
                #with open("./ip_path/proxypool.json","w",encoding="utf-8") as file
                json.dump(value_proxy_list,
                          open("./ip_path/proxypool.json",
                               "w",
                               encoding="utf-8"),
                          ensure_ascii=False,
                          indent=2)
                print("正在处理ip:{}".format(proxy))
        except Exception:
            print("异常ip:{}".format(proxy))
Example #17
# coding:utf-8
"""
思路分析:
(1)爬取整个新浪网的页面数据内容;
(2)把爬取下来的数据内容保存到一个文件中。
"""
import requests
import useragentutil

sina_url = "https://www.sina.com.cn/"
# Crawl the full page content of the Sina homepage
headers = useragentutil.get_headers()
# print(headers)
response = requests.get(sina_url, headers=headers)
# Get the decoded content
# html_content = response.text
html_content = response.content.decode("utf-8")
# print(html_content)

# Save the crawled content to a file
with open("./file/sina.html", "w", encoding="utf-8") as writer:
    writer.write(html_content)
print("新浪网页面数据已保存成功!")
Example #18
def make_data(mode, url=""):
    """
    :param mode: 模式,支持的模式有share like1 like2 tag
    :param url:  生成data需要用到url,share like1 需要的是用户主页的url,tag需要的是tag页的url。like2不会用到,因为信息在cookies种
    :return:
    """
    if (mode == "like1" or mode == "share" or mode == "tag") and url == "":
        print("{}模式生成data需要url参数".format(mode))
        return {}

    base_data = {
        'callCount': '1',
        'httpSessionId': '',
        'scriptSessionId': '${scriptSessionId}187',
        'c0-id': '0',
        "batchId": "472351"
    }
    get_num = 100
    got_num = 0
    if mode == "share" or mode == "like1":
        userId = ""
        user_page_parse = etree.HTML(
            requests.get(
                url,
                headers=useragentutil.get_headers()).content.decode("utf-8"))
        try:
            userId = user_page_parse.xpath(
                "//body/iframe[@id='control_frame']/@src")[0].split(
                    "blogId=")[1]
        except:
            print("\n链接与模式不匹配")
            exit()
        data_parme = {
            'c0-scriptName': 'BlogBean',
            "c0-methodName": "",
            'c0-param0': 'number:' + str(userId),
            'c0-param1': 'number:' + str(get_num),
            'c0-param2': 'number:' + str(got_num),
            'c0-param3': 'string:'
        }
        if mode == "like1":
            data_parme["c0-methodName"] = "queryLikePosts"
        else:
            data_parme["c0-methodName"] = "querySharePosts"

    elif mode == "like2":
        data_parme = {
            "c0-scriptName": "PostBean",
            "c0-methodName": "getFavTrackItem",
            "c0-param0": "number:" + str(get_num),
            "c0-param1": "number:" + str(got_num),
        }
    elif mode == "tag":
        # Parameter 8 takes the current timestamp in milliseconds
        url_search = re.search("http[s]{0,1}://www.lofter.com/tag/(.*?)/(.*)",
                               url)
        type = url_search.group(2)
        if type == "":
            type = "new"
        data_parme = {
            'c0-scriptName': 'TagBean',
            'c0-methodName': 'search',
            'c0-param0': 'string:' + url_search.group(1),
            'c0-param1': 'number:0',
            'c0-param2': 'string:',
            'c0-param3': 'string:' + type,
            'c0-param4': 'boolean:false',
            'c0-param5': 'number:0',
            'c0-param6': 'number:' + str(get_num),
            'c0-param7': 'number:' + str(got_num),
            'c0-param8': 'number:' + str(int(time.time() * 1000)),
            'batchId': '870178'
        }
    else:
        print("data-模式错误")
        data_parme = {}
    data = {**base_data, **data_parme}
    return data
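A usage sketch for the tag mode, posting the generated payload the way the comment-fetching code in Example #24 posts its DWR request; the endpoint path follows the ScriptName.methodName.dwr pattern seen there and should be treated as an assumption:

import requests
import useragentutil

tag_url = "https://www.lofter.com/tag/example/new"  # placeholder tag page URL
data = make_data("tag", url=tag_url)
dwr_url = "https://www.lofter.com/dwr/call/plaincall/TagBean.search.dwr"  # assumed endpoint
response = requests.post(dwr_url, data=data, headers=useragentutil.get_headers())
print(response.status_code)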
Example #19
def save_img(imgs_info, file_path, img_save_info, classify_by_tag, prior_tags,
             agg_non_prior_tag, print_level):
    if not os.path.exists(file_path + "/img"):
        os.makedirs(file_path + "/img")

    if classify_by_tag and prior_tags:
        for x in ["prior", "other"]:
            if not os.path.exists(file_path + "/img/" + x):
                os.makedirs(file_path + "/img/" + x)

    # saved_num is the number of blogs already saved (resumes from the previous run)
    count = 0
    saved_num = img_save_info["已保存"]
    for img_info in imgs_info:
        # Skip ahead to where the previous run stopped
        if count < saved_num:
            count += 1
            continue
        print_end = lambda x: "\n" if x == 1 else "   "
        print("正在保存:博客序号{} {}".format(count + 1, img_info["url"]),
              end=print_end(print_level))
        for img_url in img_info["img urls"]:
            is_gif = re.findall("gif", img_url)
            is_png = re.findall("png", img_url)
            if is_gif:
                img_type = "gif"
            elif is_png:
                img_type = "png"
            else:
                img_type = "jpg"

            if print_level:
                print("正在保存图片 {} ".format(img_url), end="\t\t")
            # Check whether the image is hosted on lofter
            # re_url = re.findall('http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*', img_url)
            re_url = re.findall(
                'http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*', img_url)

            if not re_url:
                print("\n图片 {} 不是lofter站内图 ".format(img_url), end="\t")
            try:
                img = requests.get(img_url,
                                   headers=useragentutil.get_headers()).content

            except:
                print("保存失败,请尝试手动保存")
                continue
            filename = img_info["author name in filename"] + "[" + img_info[
                "author ip"] + "] " + img_info["public time"] + "." + img_type

            # Choose the save path according to the auto-classification options
            key_tag_path = img_info["key tag"].replace("/", "&").replace("|", "&").replace("\\", "&") \
                .replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
                .replace("*", "·").replace("(", "(").replace(")", ")")
            # Tag classification disabled
            if not classify_by_tag:
                img_path = file_path + "/img"
            # Tag classification enabled, priority tags disabled
            elif classify_by_tag and not prior_tags:
                if not os.path.exists(file_path + "/img/" + key_tag_path):
                    os.makedirs(file_path + "/img/" + key_tag_path)
                img_path = file_path + "/img/" + key_tag_path
            # Tag classification enabled, priority tags enabled
            else:
                # The key tag is one of the priority tags
                if img_info["key tag"] in prior_tags:
                    img_path = file_path + "/img/prior/" + key_tag_path
                # The key tag is not a priority tag
                else:
                    # Not a priority tag, aggregation of non-priority tags enabled
                    if agg_non_prior_tag:
                        img_path = file_path + "/img/other"
                    # Not a priority tag, aggregation of non-priority tags disabled
                    else:
                        img_path = file_path + "/img/other/" + key_tag_path
            # De-duplicate the filename and save
            filename = filename_check(filename, img, img_path, img_type)
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            write_img(img, filename, img_path)
            if print_level:
                print("保存完成")
        if not print_level:
            print("保存完成")
        else:
            print("\n" + "-----------" * 10)
        # Increment the save counter; flush progress to file every 7 blogs
        saved_num += 1
        count += 1
        if saved_num % 7 == 0 or saved_num == len(imgs_info):
            img_save_info["已保存"] = saved_num
            with open(file_path + "/img_save_info.json", "w",
                      encoding="utf-8") as i_op1:
                i_op1.write(
                    json.dumps(img_save_info, indent=4, ensure_ascii=False))
Example #20
def save_article(articles_info, file_path, classify_by_tag, prior_tags,
                 agg_non_prior_tag, save_img_in_text, print_level):
    # When tag classification is enabled, create the prior and other folders first
    if classify_by_tag and prior_tags:
        for x in ["prior", "other"]:
            if not os.path.exists(file_path + "/article/" + x):
                os.makedirs(file_path + "/article/" + x)
    count = 0
    is_tag_null = lambda x: x if x != "" else "无"

    for article_info in articles_info:
        # Assemble the document text
        article_head = article_info["title"] + " by " + article_info["author name"] + "[" + article_info[
            "author ip"] + "]" + "\n发表时间:" + article_info["public time"] + "\n原文连接:" + article_info["url"] \
                       + "\ntags:" + is_tag_null(", ".join(article_info["tags"]))

        article_tail = get_tail(article_info)
        article = article_head + "\n\n\n" + article_info[
            "content"] + "\n\n\n" + article_tail
        filename_title = article_info["title in filename"]
        filename = filename_title + " by " + article_info[
            "author name in filename"] + ".txt"
        # Progress output
        count += 1
        if print_level:
            try:
                print("保存:文章序号{} {} 原文链接:{}".format(
                    articles_info.index(article_info) + 1, filename,
                    article_info["url"]),
                      end="\t\t")
            except:
                print("保存:文章序号{} 原文链接:{}".format(
                    articles_info.index(article_info) + 1,
                    article_info["url"]),
                    end="\t\t")
        else:
            if count % 20 == 0 or count == len(articles_info) or count == 0:
                try:
                    print("保存进度 {}/{}\t\t{}".format(count, len(articles_info),
                                                    filename),
                          end="\t\t")
                except:
                    print("保存进度 {}/{}\t\t".format(count, len(articles_info)),
                          end="\t\t")

        # Decide the file path
        # Tag classification disabled
        key_tag_path = article_info["key tag"].replace("/", "&").replace("|", "&").replace("\\", "&") \
            .replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("(", "(").replace(")", ")")
        if not classify_by_tag:
            article_path = file_path + "/article"
        # Tag classification enabled, priority tags disabled
        elif classify_by_tag and not prior_tags:
            article_path = file_path + "/article/" + key_tag_path
        # Tag classification enabled, priority tags enabled
        else:
            # The key tag is one of the priority tags
            if article_info["key tag"] in prior_tags:
                article_path = file_path + "/article/prior/" + key_tag_path
            # The key tag is not a priority tag
            else:
                # Not a priority tag, aggregation of non-priority tags enabled
                if agg_non_prior_tag:
                    article_path = file_path + "/article/other"
                # Not a priority tag, aggregation of non-priority tags disabled
                else:
                    article_path = file_path + "/article/other/" + key_tag_path

        # Create the folder if it does not exist
        if not os.path.exists(article_path):
            os.makedirs(article_path)
        # Save the article
        write_text(article, filename, article_path)
        # Save images embedded in the article
        if save_img_in_text:
            if article_info["illustration"]:
                for img_url in article_info["illustration"]:
                    if print_level:
                        print("准备保存文章中的图片 {}".format(img_url), end="\t\t")
                    img_name = filename_title + " by " + article_info[
                        "author name in filename"] + ".jpg"
                    img = requests.get(
                        img_url, headers=useragentutil.get_headers()).content
                    img_name = filename_check(img_name, img, article_path,
                                              "jpg")
                    write_img(img, img_name, article_path)
                    if print_level:
                        print("保存完成")

        # Progress output
        if print_level:
            print("保存完成")
        else:
            if count % 20 == 0 or count == len(articles_info):
                print("保存完成")
Example #21
def get_parse(url):
    content = requests.get(url, headers=useragentutil.get_headers()).content
    parse = etree.HTML(content)
    return parse
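A usage sketch, reusing get_parse the way the other examples query parsed pages (the URL is a placeholder; the XPath is the title lookup used in Example #1):

parse = get_parse("https://example.lofter.com/")  # placeholder author homepage
title = parse.xpath("//title/text()")[0]
print(title)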
Example #22
def parse_blogs_info(blogs_info, parsed_blogs_info, author_name, author_ip,
                     target_tags, tags_filter_mode, file_update_interval):
    """
    :param blogs_info: 未解析的博客信息
    :param parsed_blogs_info: 已解析完的博客信息
    :param author_name: 作者名
    :param author_ip 作者的lofter三级域名
    :param target_tags 保留带有哪些tag的博客
    :param tags_filter_mode 博客过滤方式
    :return: 无

    解析完成的图片信息会写入./dir/imgs_info.json
    """
    global pre_page_last_img_info
    imgs_info = get_file_contetn(
        "./dir/author_img_file/imgs_info.json")  # 上次获取到的图片信息
    parsed_num = len(parsed_blogs_info)

    # Loop len(blogs_info) times, parsing the first element of blogs_info each time and deleting it
    # afterwards; blogs_info is flushed to file periodically so the crawl can resume after a failure
    for blog_num in range(len(blogs_info)):
        blog_url = blogs_info[0]["blog_url"]
        img_time = blogs_info[0]["time"]
        print("博客 %s 开始解析" % blog_url, end="  ")
        content = requests.get(
            blog_url,
            headers=useragentutil.get_headers()).content.decode("utf-8")

        blog_tags = re.findall(r'"http[s]{0,1}://.*?.lofter.com/tag/(.*?)"',
                               content)
        blog_tags = list(
            map(lambda x: unquote(x, "utf-8").replace("\xa0", " "), blog_tags))

        if target_tags:
            if not tag_filter(blog_tags, target_tags, tags_filter_mode):
                del blogs_info[0]
                parsed_num += 1
                print("该篇博客被过滤掉,剩余%d" % (len(blogs_info)))
                # Flush progress files
                if (blog_num % file_update_interval
                        == 0) or len(blogs_info) == 0:
                    file_update("./dir/author_img_file/blogs_info.json",
                                blogs_info)
                    file_update("./dir/author_img_file/imgs_info.json",
                                imgs_info)
                    file_update("./dir/author_img_file/blogs_info_parsed.json",
                                parsed_blogs_info)
                    print("文件刷新")
                    time.sleep(random.randint(1, 2))
                continue

        # Different author homepages have different structures, so instead of XPath a regex matches all
        # image links (which also picks up some comment avatars and recommended images).
        # Image links from roughly before September use the nosdn format; later ones use imglf.
        # imgs_url = re.findall('"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)"', content)

        imgs_url = re.findall(
            '"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)"', content)

        # If filtering leaves nothing, no valid image was found with the new pattern
        if not img_fliter(imgs_url, "img"):
            print("使用旧正则表达式", end="\t")
            imgs_url = re.findall(
                '"(http[s]{0,1}://imglf\d.nosdn\d*.[0-9]{0,3}\d.net.*?)"',
                content)

        # Filter the image links
        imgs_url = img_fliter(imgs_url, "img")

        # If this blog shares its publication date with the previous one, continue the filename index from there
        img_index = 0
        if img_time == pre_page_last_img_info["last_file_time"]:
            img_index = pre_page_last_img_info["index"]

        # Assemble the image info for the saving step
        count = 0
        for img_url in imgs_url:
            # Determine whether the image is jpg, png or gif
            is_gif = re.findall("gif", img_url)
            is_png = re.findall("png", img_url)
            if is_gif:
                img_type = "gif"
            elif is_png:
                img_type = "png"
            else:
                img_type = "jpg"

            img_info = {}
            img_info["img_url"] = img_url
            img_index += 1
            img_info[
                "pic_name"] = author_name + "[" + author_ip + "] " + img_time + "(" + str(
                    img_index) + ")." + img_type
            imgs_info.append(img_info)
            count += 1

            # Used to check whether the next blog was published on the same day
            pre_page_last_img_info["last_file_time"] = img_time
            pre_page_last_img_info["index"] = img_index

        # next_some_time marks whether the next blog has the same publication time; if so the files must
        # not be flushed yet, otherwise an interruption would lose the pre_page_last_img_info state
        try:
            if blogs_info[0]["time"] == blogs_info[1]["time"]:
                next_some_time = 1
            else:
                next_some_time = 0
        except:
            next_some_time = 0

        parsed_num += 1
        parsed_blogs_info.append(blogs_info[0])
        del blogs_info[0]
        print(
            "解析完成,获取到图片链接%d,总获取图片数%d,已解析完成%d个链接(本次运行中已解析%d个链接),剩余%d" %
            (count, len(imgs_info), parsed_num, blog_num + 1, len(blogs_info)))

        # print(imgs_url)
        # print("--------"*10)

        # At the configured interval, flush the unparsed blogs, the parsed image info and the
        # already-parsed blogs to file
        if (blog_num % file_update_interval == 0
                and not next_some_time) or len(blogs_info) == 0:
            file_update("./dir/author_img_file/blogs_info.json", blogs_info)
            file_update("./dir/author_img_file/imgs_info.json", imgs_info)
            file_update("./dir/author_img_file/blogs_info_parsed.json",
                        parsed_blogs_info)
            print("文件刷新")
            time.sleep(random.randint(1, 2))

    with open("./dir/author_img_file/blogs_info.json", "w") as op:
        op.write("finished")
Example #23
def run(author_url, start_time, end_time, target_tags, tags_filter_mode,
        file_update_interval):
    author_page_parse = etree.HTML(
        requests.get(
            author_url,
            headers=useragentutil.get_headers()).content.decode("utf-8"))
    # id is a numeric parameter needed to request the archive page; ip is the author's lofter subdomain, set at registration
    author_id = author_page_parse.xpath(
        "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
    author_ip = re.search(r"http[s]*://(.*).lofter.com/", author_url).group(1)

    try:
        author_name = author_page_parse.xpath("//title//text()")[0]
    except:
        author_name = input("解析作者名时出现异常,请手动输入\n")
    archive_url = author_url + "dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr"

    query_num = 50
    data = make_data(author_id, query_num)
    head = make_head(author_url)

    try:
        print("作者名%s,lofter ip %s,主页链接 %s" %
              (author_name, author_ip, author_url))
    except:
        print("作者名中有异常符号,无法显示,lofter ip %s,主页链接 %s" % (author_ip, author_url))

    if target_tags:
        print("tag过滤已经打开,仅保存含有tag中包含%s的图片," %
              (" [" + ",".join(target_tags) + "] "),
              end="")
        if tags_filter_mode == "in":
            print("没有tag的图片将会保留")
        else:
            print("没有tag的图片将不会保留")
    else:
        print("tag过滤未打开,将保存所有图片")
    print("tag过滤和模式参数为:target_tags,tags_filter_mode,请根据需求自行修改")
    start_command = input("输入ok以启动程序\n")
    if start_command != "ok":
        print("程序退出")
        exit()

    deal_file("init")
    dir_path = "./dir/author_img_file"
    # Check how far blog parsing has progressed
    if is_file_in(dir_path + "/blogs_info.json") == "finished":
        print("所有博客已解析完毕,跳转至图片下载")
    elif is_file_in(dir_path + "/blogs_info.json"):
        blogs_info = get_file_contetn(dir_path + "/blogs_info.json")
        parsed_blogs_info = get_file_contetn(dir_path +
                                             "/blogs_info_parsed.json")
        print("读取到上次运行保存的博客信息:未解析博链接%d条,已解析链接%d条,接上次继续运行" %
              (len(blogs_info), len(parsed_blogs_info)))
        parse_blogs_info(blogs_info, parsed_blogs_info, author_name, author_ip,
                         target_tags, tags_filter_mode, file_update_interval)
    else:
        print("开始获取归档页面数据,链接 %s (不能直接点开)" % archive_url)
        blog_infos = parse_archive_page(url=archive_url,
                                        data=data,
                                        header=head,
                                        author_url=author_url,
                                        query_num=query_num,
                                        start_time=start_time,
                                        end_time=end_time)
        parsed_blogs_info = get_file_contetn(dir_path +
                                             "/blogs_info_parsed.json")
        file_update(dir_path + "/blogs_info.json", blog_infos)
        print("归档页面数据保存完毕,开始解析博客页面")
        parse_blogs_info(blog_infos, parsed_blogs_info, author_name, author_ip,
                         target_tags, tags_filter_mode, file_update_interval)
        print("博客解析完毕,开始图片下载")
    # Check how far image saving has progressed
    if is_file_in(dir_path + "/imgs_info.json") == "finished":
        print("该作者首页的所有图片已保存完毕,无需操作")
    else:
        imgs_info = get_file_contetn(dir_path + "/imgs_info.json")
        imgs_info_saved = get_file_contetn(dir_path + "/imgs_info_saved.json")
        download_img(imgs_info, imgs_info_saved, author_name, author_ip,
                     file_update_interval)
        print("所有图片保存完毕")

    deal_file("del")
    print("程序运行结束")
Example #24
def save_file(blog_infos, author_name, author_ip, get_comm):
    all_file_name = []
    print("开始保存文章内容")
    # Take one post to test which parsing template matches
    first_parse = get_parse(blog_infos[0]["url"])
    template_id = parse_template.matcher(first_parse)
    print("文字匹配模板为模板{}".format(template_id))
    if template_id == 0:
        print("文字匹配模板是根据作者主页自动匹配的,模板0是一个匹配度比较广的模板,使用模板0说明没有其他的模板匹配成功,除了文章主体之外可能会爬到一些其他的内容,也有可能出现文章部分内容缺失")
        input1 = input("输入ok确定继续爬取,或输入任意其他文字退出\n")
        if not input1 == "ok":
            print("退出")
            exit()
    # Start saving

    arthicle_path = "./dir/article/{}".format(author_name)
    for blog_info in blog_infos:
        # Extract the info
        title = blog_info["title"]
        print_title = blog_info["print_title"]
        public_time = blog_info["time"]
        url = blog_info["url"]
        blog_type = blog_info["blog_type"]
        print("准备保存:{} ,原文连接: {} ".format(print_title, url), end="    ")

        # File header
        if blog_info["blog_type"] == "article":
            article_head = "{} by {}[{}]\n发表时间:{}\n原文链接: {}".format(title, author_name, author_ip, public_time, url)
        else:
            article_head = "{}\n原文链接: {}".format(title, url)
        # Main body
        content = requests.get(url, headers=useragentutil.get_headers()).content
        parse = etree.HTML(content)
        article_content = parse_template.get_content(parse, template_id, title, blog_type)
        comm_list = []
        # Comments
        if get_comm:

            referer_url = parse.xpath("//div[@class='main comment']//iframe/@src")[0]
            param0 = re.search("pid=(\d+)&bid=", referer_url).group(1)
            number1 = 50
            number2 = 0
            comm_url = "https://www.lofter.com/dwr/call/plaincall/PostBean.getPostResponses.dwr"
            headers = {
                'Host': 'www.lofter.com',
                'Origin': 'https://www.lofter.com',
                'Referer': "https:" + referer_url,
                'Accept-Encoding': 'gzip, deflate',
            }
            all_comm_str = ""
            while True:
                comm_data = {"callCount": "1",
                             "scriptSessionId": "${scriptSessionId}187",
                             "httpSessionId": "",
                             "c0-scriptName": "PostBean",
                             "c0-methodName": "getPostResponses",
                             "c0-id": "0",
                             "c0-param0": "number:{}".format(param0),
                             "c0-param1": "number:{}".format(number1),
                             "c0-param2": "number:{}".format(number2),
                             "batchId": "334950"}
                number2 += number1
                comm_response = requests.post(comm_url, data=comm_data, headers=headers)
                comm_text = comm_response.content.decode("utf-8")
                all_comm_str += comm_text
                comm_infos = comm_text.split("anonymousUser")[1:]
                if not comm_infos:
                    break

                for comm_info in comm_infos:
                    # In the returned data every comment has an s\d+ identifier
                    comm_sid = re.search("(s\d+)\.appVersion", comm_info).group(1)
                    # Comment content
                    comm_content = re.search(comm_sid + '\.content="(.*?)";', comm_info).group(1) \
                        .encode('utf8', errors="replace").decode('unicode_escape')
                    # Comment publication time
                    comm_publish_time = re.search(comm_sid + '\.publishTime=(\d+);', comm_info).group(1)
                    public_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(int(comm_publish_time) / 1000))

                    # Commenter info
                    publisher_sid = re.search(comm_sid + "\.publisherMainBlogInfo=(.*?);", comm_info).group(1)
                    # Nickname
                    re_publisher_nickname = re.search(publisher_sid + '\.blogNickName="(.*?)";', comm_info)
                    if not re_publisher_nickname:
                        re_publisher_nickname = re.search(publisher_sid + '\.blogNickName="(.*?)";', all_comm_str)
                    publisher_nickname = re_publisher_nickname.group(1) \
                        .encode('utf8', errors="replace").decode('unicode_escape')
                    # Blog (user) name
                    re_publisher_blogname = re.search(publisher_sid + '\.blogName="(.*?)";', comm_info)
                    if not re_publisher_blogname:
                        re_publisher_blogname = re.search(publisher_sid + '\.blogName="(.*?)";', all_comm_str)
                    publisher_blogname = re_publisher_blogname.group(1) \
                        .encode('utf8', errors="replace").decode('unicode_escape')

                    # Reply target
                    reply_blogsid = re.search(comm_sid + "\.replyBlogInfo=(.*?);", comm_info).group(1)
                    if not reply_blogsid == "null":
                        re_reply_nickname = re.search(reply_blogsid + '\.blogNickName="(.*?)";', comm_info)
                        if not re_reply_nickname:
                            re_reply_nickname = re.search(reply_blogsid + '\.blogNickName="(.*?)";', all_comm_str)
                        reply_nickname = re_reply_nickname.group(1).encode('utf8', errors="replace").decode(
                            'unicode_escape')
                        re_reply_blogname = re.search(reply_blogsid + '\.blogName="(.*?)";', comm_info)
                        if not re_reply_blogname:
                            re_reply_blogname = re.search(reply_blogsid + '\.blogName="(.*?)";', all_comm_str)
                        reply_blogname = re_reply_blogname.group(1)
                    else:
                        reply_nickname = ""
                        reply_blogname = ""
                    if reply_nickname:
                        comm = "{} {}[{}] 回复 {}[{}]:{}".format(public_time, publisher_nickname, publisher_blogname,
                                                               reply_nickname, reply_blogname, comm_content)
                    else:
                        comm = "{}  {}[{}]:{}".format(public_time, publisher_nickname, publisher_blogname, comm_content)
                    comm_list.append(comm)
        comm_list = comm_list[::-1]
        # File tail: images embedded in the article
        # Match the new link format first

        illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)"', content.decode("utf-8"))

        # If filtering leaves nothing, no valid image was found with the new pattern
        if not l4_author_img.img_fliter(illustration, blog_type):
            illustration = re.findall('"(http[s]{0,1}://imglf\d.nosdn\d*.[0-9]{0,3}\d.net.*?)"',
                                      content.decode("utf-8"))
        illustration = l4_author_img.img_fliter(illustration, blog_type)
        '''
        illustration = re.findall('(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)\?', tmp_str)
        if illustration == []:
            # 匹配旧格式
            illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)\?',
                                      "\n".join(img_src))
        '''
        if illustration:
            article_tail = "博客中包含的图片:\n" + "\n".join(illustration)
        else:
            article_tail = ""

        # Full text
        article = article_head + "\n\n\n" + article_content + "\n\n\n" + article_tail + \
                  ("\n\n\n-----评论-----\n\n" + "\n".join(comm_list) if comm_list else "")
        article = article.encode("utf-8", errors="replace").decode("utf-8", errors="replace")

        # Filename
        if blog_info["blog_type"] == "article":
            # Articles are named "<title> by <author>", with illegal characters replaced
            file_name = "{} by {}.txt".format(title, author_name)
            file_name = file_name.replace("/", "&").replace("|", "&").replace("\\", "&").replace("<", "《") \
                .replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?").replace("*", "·"). \
                replace("\n", "").replace("(", "(").replace(
                ")", ")").replace(",", ",")
            file_name = l13_like_share_tag.filename_check(file_name, article, arthicle_path, "txt")
        else:
            # Plain text posts need a duplicate-name check
            file_name = l13_like_share_tag.filename_check(title + ".txt", article, arthicle_path, "txt")

        # Write the file
        with open(arthicle_path + "/" + file_name, "w", encoding="utf-8") as op:
            op.write(article)
        try:
            print("{}  保存完毕".format(file_name))
        except:
            print("{}  保存完毕".format(print_title))
        all_file_name.append(file_name)
    return all_file_name
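The comment timestamps returned by the DWR call are epoch milliseconds and are formatted with time.localtime above; a standalone check of that conversion on a sample value:

import time

comm_publish_time = "1609459200000"  # sample epoch-milliseconds value, not real data
public_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(int(comm_publish_time) / 1000))
print(public_time)  # local-time formatting of the sample timestamp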