Example #1
def get_urls(all_author):
    """
    获取书法家的所有文章
    :param all_author:
    :return:
    """
    all_author_dic = {}
    for name in all_author:
        try:
            page = 1
            url = all_author[name] + "{index}.html".format(index=page)
            # print(url)
            s = get_selector.get_selector(send_request.send_requests(url))
            result = s.xpath('//div[@class="navBtn font1_2"]/text()')[0]
            page_max = result[3:result.index('页')]
            result_dic = {}
            for er_page in range(int(page_max)):
                try:
                    url = all_author[name] + "{index}.html".format(
                        index=er_page + 1)
                    # print("doing", name, "page{page}".format(page=er_page + 1))
                    s = get_selector.get_selector(
                        send_request.send_requests(url))
                    result_url = s.xpath('//li//div[@class="b"]//h4/a/@href')
                    result_title = [None] * len(result_url)
                    for i in range(len(result_url)):
                        try:
                            s = get_selector.get_selector(
                                send_request.send_requests(
                                    base_url + result_url[i].strip('.')))
                            # print("正在打开 {url_} 获取title".format(url_=base_url + result_url[i].strip('.')))
                            result_title[i] = s.xpath(
                                '//div[@class="a"]/h1/text()')[0]
                            if "�" in result_title[i]:
                                result_title[i] = result_title.replace(
                                    "�", "頫")
                            if "\\" in result_title[i]:
                                result_title[i] = result_title[i].replace(
                                    "\\", "")
                            result_dic[result_title[
                                i]] = base_url + result_url[i].strip('.')
                        except Exception:
                            # skip articles whose title cannot be fetched
                            pass
                    print("done", name, "page{page}".format(page=er_page + 1))
                except Exception as e:
                    print("error:获取这一篇文章出错", url, e)
            # print(result_dic)
            all_author_dic[name] = result_dic
        except Exception as e:
            print("error:获取这一页出错了", e)
    return all_author_dic
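
Every example here calls into two helper modules, send_request and get_selector, whose implementations are not shown. A minimal sketch of what they might look like, assuming requests and lxml; only the module and function names come from the call sites above, the bodies are assumptions:

# send_request.py -- hypothetical implementation of the fetch helper
import requests

def send_requests(url):
    """Fetch a page and return its decoded HTML text (assumed signature)."""
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding  # guess the encoding; an assumption
    return resp.text

# get_selector.py -- hypothetical implementation of the parser helper
from lxml import html

def get_selector(text):
    """Wrap HTML text in an lxml element that supports .xpath()."""
    return html.fromstring(text)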
Example #2
def get_img_info(dic):
    """
    获取图片并下载
    :return:
    """
    for name in dic:
        path = create_dir(r"/root/bao", name)  # create the per-author directory first
        print("***正在弄***", name)
        for title in dic[name]:
            try:
                all_imgs = []
                # print("正在下载", title, dic[name][title])
                file_path = create_dir(path, title)  # then create the per-article directory
                s = get_selector.get_selector(
                    send_request.send_requests(dic[name][title]))
                result_page = s.xpath('//td//a[@rel="nofollow"]/text()')
                img_list = s.xpath('//div[@id="newsContent"]//img/@src')
                if img_list:  # "is not []" is always True; test truthiness instead
                    for i in range(len(img_list)):
                        if img_list[i].startswith('.'):
                            img_list[i] = base_url + img_list[i].lstrip('.')
                all_imgs.extend(img_list)
                if result_page:
                    max_page = int(result_page[-2].strip('.'))
                else:
                    max_page = 1
                if max_page > 1:
                    for er_page in range(1, max_page):
                        # rstrip('.html') strips a character set, not a suffix,
                        # so cut the ".html" suffix off explicitly
                        url = dic[name][title][:-len('.html')]
                        url = url + "_{page}.html".format(page=er_page + 1)
                        s = get_selector.get_selector(
                            send_request.send_requests(url))
                        # print("获取图片链接", url)
                        img_list = s.xpath(
                            '//div[@id="newsContent"]//img/@src')
                        if img_list:
                            for j in range(len(img_list)):
                                if img_list[j].startswith('.'):
                                    img_list[j] = base_url + img_list[j].lstrip('.')
                        all_imgs.extend(img_list)
                count = 1
                for img in all_imgs:
                    # forward slash: the base directory /root/bao is a POSIX path
                    filename = file_path + "/{i}.jpg".format(i=count)
                    # print("downloading", filename)
                    down_load(img, filename)
                    count = count + 1
            except Exception as e:
                print("下载图片错了", e)
Example #3
def get_img_element_id(url):
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        id_list = s.xpath('//div[@id="beitie-imgs-container"]//div/@id')
        return id_list
    except Exception as e:
        print(e)
        return []  # return an empty list so callers can still iterate
Example #4
def get_url(name):
    s = get_selector.get_selector(send_request.send_requests(search_url +
                                                             name))
    url = s.xpath('//li[@class="item clearfix"][1]//h4//a/@href')[0]
    url = base_url + url
    print(url)
    return url
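
Examples #3 and #4 look like two halves of the same copybook scraper; if that assumption holds, they chain naturally. The name below is only an illustrative input:

# Hypothetical usage, assuming get_url (Example #4) and
# get_img_element_id (Example #3) live in the same module
url = get_url("赵孟頫")         # search by name, take the first hit
ids = get_img_element_id(url)   # ids of the image containers on that page
print(ids)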
Example #5
def get_info(url):
    """

    :param url:
    :return: 返回的a是营养价值,b是食用效果
    """
    dic = {"食材": "", "简介": "", "营养价值": "", "食用效果": ""}
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        # collect the health-benefit info
        fun = s.xpath(
            '//div[@class="fd_yygx"]//strong[1]/following-sibling::*')
        final_fun = []
        intro = s.xpath('//div[@class="n_sp_main_info"]//p/text()')
        dic["食材"] = s.xpath('//div[@class="fd_tit2"]//strong/text()')
        t = []
        jianjie = s.xpath('//div[@class="n_sp_main_info"]//p/text()')
        for i in jianjie:
            t.append(i.strip())
        dic["简介"] = t
        for i in range(len(fun)):
            info = s.xpath(
                '//div[@class="fd_yygx"]//strong[1]/following-sibling::*[%d]/text()'
                % (i + 1))
            if info:  # xpath returns a list, never None
                final_fun = final_fun + info
        final_fun = [item.strip() for item in final_fun]
        for i in range(len(intro)):
            intro[i] = intro[i].strip()
        final_fun = [item for item in final_fun if item]  # drop empty strings safely
        for i in range(len(final_fun)):
            if "的挑选" in final_fun[i]:
                final_fun = final_fun[0:i]
                break
        for i in range(len(final_fun)):
            if "的存储方法" in final_fun[i]:
                final_fun = final_fun[0:i]
                break
        a = []
        b = []
        for i in range(len(final_fun)):
            if "的食用效果" in final_fun[i]:
                a = final_fun[1:i]
                b = final_fun[i + 1:]
        if len(a) == 0:  # "is 0" relies on int interning; compare with == instead
            print(final_fun[0])
            if "的营养价值" in final_fun[0]:
                print("doit")
                a = final_fun[1:]
        print(a)
        print(b)
        dic["营养价值"] = a
        dic["食用效果"] = b
        return dic
    except Exception as e:
        print(e)
Example #6
def get_info(url):
    dic = {}
    s = get_selector.get_selector(send_request.send_requests(url))
    name = s.xpath('//h2[@class="crumb"]/text()')[-1].strip().replace('/ ', '')
    calorie = s.xpath(
        '//span[@id="food-calory"]//span[@class="stress red1"]/text()')[0]
    img_url = s.xpath('//div[@class="food-pic pull-left"]//a/@href')[0]
    dic["new_name"] = name
    dic["calorie"] = calorie
    dic["img_url"] = img_url
    return dic
Example #7
def get_urls():
    """

    :return:
    """
    urls = []
    s = get_selector.get_selector(send_request.send_requests(aim_url))
    m = [8, 9, 10, 14, 15, 25]  # positions of the <dl> blocks to scrape
    for i in m:
        url = s.xpath('//div[@class="fd_cont"]//dl[%d]//a/@href' % i)
        urls = urls + url
    return urls
Example #8
def get_img_url(name):
    base_url = "http://so.meishi.cc/index.php?q="
    try:
        time.sleep(2)
        s = get_selector.get_selector(
            send_request.send_requests(base_url + name))
        img = s.xpath(
            '//div[@class="search2015_cpitem"][1]//a[@class="img"]//img/@src')
        print(name, img)
        return img[0]
    except Exception as e:
        print(e)
Example #9
def get_url(page_url):
    """
    该方法返回当前界面的所有url
    :param page_url: 某一页的网址
    :return:
    """
    try:
        s = get_selector.get_selector(send_request.send_requests(page_url))
        urls = s.xpath('//div[@class="item_all"]//p[@class="item_tit"]//a/@href')
        return urls
    except Exception as e:
        print(e)
        return []
Example #10
def get_urls(name_list):
    all_tie = {}
    for name in name_list:
        try:
            url = "/".join((base_url, "shufa", name))
            print("正在处理", name, url)
            s = get_selector.get_selector(send_request.send_requests(url))
            result = s.xpath('//div[@class="col-md-4 col-sm-6"]//a/@href')
            for i in result:
                all_tie[i.split('/')[-2]] = base_url + i
            print(all_tie)
        except Exception as e:
            print("字帖获取失败了", e)
    return all_tie
Example #11
def get_info(url):
    s = get_selector.get_selector(send_request.send_requests(url))
    info = s.xpath('//div[@class="content"]//dd[2]/following::dd//span/text()')
    info_dic = {}
    for i in range(len(info) // 2):
        try:
            # info alternates name, value, name, value, ...
            info_dic[info[i * 2]] = info[i * 2 + 1]
        except IndexError:
            pass
    for ingredient in info_dic:
        try:
            float(info_dic[ingredient])
        except ValueError:
            info_dic[ingredient] = '-1'  # flag non-numeric values
    return info_dic
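
The index arithmetic above pairs consecutive items; an equivalent and arguably clearer way to build the dict, sketched under the same alternating-layout assumption:

# pair alternating name/value items with zip (a trailing unpaired item is dropped)
info_dic = dict(zip(info[0::2], info[1::2]))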
Example #12
def get_info(dic):
    for name in dic:
        try:
            s = get_selector.get_selector(send_request.send_requests(
                dic[name]))
            img1 = base_url + s.xpath('//div[@id="a0"]/img/@src')[0]
            img_urls = s.xpath(
                '//div[@id="a0"]/following-sibling::*/@data-original')
            for i in range(len(img_urls)):
                img_urls[i] = base_url + img_urls[i]
            img_urls.append(img1)
            dic[name] = img_urls
            print("get info done", name)
        except Exception as e:
            print("图片链接获取失败", e)
    return dic
Example #13
def get_info(url):
    """
    获取详细内容
    :param url: 菜品的url
    :return: 返回的字典
    """
    dic = {
        "菜名": "",
        "分类": "",
        "口味": "",
        "食材": "",
        "主要工艺": "",
        "制作时间": "",
        "做法": "",
        "图片url": "",
    }
    try:
        time.sleep(1)
        print("open", url)
        s = get_selector.get_selector(send_request.send_requests(url))
        dic["图片url"] = s.xpath('//div[@class="cp_headerimg_w"]//img/@src')[0]
        dic["菜名"] = s.xpath('//h1[@class="title"]//a/text()')[0]
        dic["分类"] = s.xpath('//dl[@class="yj_tags clearfix"]//a/text()')
        dic["主要工艺"] = s.xpath('//li[@class="w127"]//a/text()')[0]
        dic["口味"] = s.xpath('//li[@class="w127 bb0"]//a/text()')[0]
        dic["制作时间"] = s.xpath('//li[@class="w270 bb0 br0"]//div[@class="processing_w"]//a/text()')[0]
        zhuliao = s.xpath('//div[@class="c"]//h4/child::*/text()')
        fuliao = s.xpath('//div[@class="yl fuliao clearfix"]//ul[@class="clearfix"]/descendant::*/text()')
        cailiao = []
        for i in range(len(zhuliao)):
            if i % 2 == 0:
                temp = zhuliao[i] + ":" + zhuliao[i + 1]
                cailiao.append(temp)
        for i in range(len(fuliao)):
            if i % 2 == 0:
                temp = fuliao[i] + ":" + fuliao[i + 1]
                cailiao.append(temp)
        dic["食材"] = cailiao
        steps = s.xpath('//div[@class="editnew edit"]//div//p/text()')
        for i in range(len(steps)):
            steps[i] = str(i + 1) + steps[i]
        dic["做法"] = steps
        print(dic.get("菜名"), "done")
    except Exception as e:
        print("获取菜失败", e)
    return dic
Example #14
def get_urls(menu_url):
    """
    获取界面上的菜的url
    :param menu_url: 某类食谱
    :return: 该类食谱的所有菜的url
    """
    urls_list = []
    for i in range(11):
        time.sleep(1)
        try:
            url = menu_url + str(i + 1)
            print(url)
            s = get_selector.get_selector(send_request.send_requests(url))
            urls = s.xpath('//div[@class="listtyle1"]//a/@href')
            urls_list.extend(urls)
            print("page"+str(i), "done")
        except Exception as e:
            print("打开失败", e)
    return urls_list
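
Examples #13 and #14 appear to come from the same recipe scraper; assuming they share a module, a typical driver loop might look like this. The menu_url value is hypothetical:

# Hypothetical driver combining get_urls (Example #14) and get_info (Example #13)
menu_url = "https://example.com/caipu/p"  # made-up category url prefix
dishes = []
for dish_url in get_urls(menu_url):
    dishes.append(get_info(dish_url))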
Example #15
def get_info(url):
    """
    获取段落文字信息
    (这里有个单词记错了,comment应该是content,不过问题应该不大)
    :param url: 详情界面url
    :return:
    """
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        comment = s.xpath('//div[@id="articleContent"]/p/following-sibling::*/text()')
        title = s.xpath('//h1/text()')[0]
        for k in range(len(comment)):  # don't shadow the selector `s`
            # strip whitespace, then any leading punctuation, in one pass
            comment[k] = comment[k].strip().lstrip('。(?)!:、,》《:')
        # filter instead of removing while iterating, which skips elements
        comment = [i for i in comment if len(i) > 1]
        try:
            # cut everything from the site's copyright notice onward
            index = comment.index("本篇文章版权归民福康健康所有,未经许可,谢绝转载。")
            s = "\n".join(comment[:index])
        except ValueError:  # notice not present
            s = "\n".join(comment)
        print("正在获取", comment[0][:5], "...")
        dic = {}
        dic['title'] = title
        dic['content'] = s
        return dic
    except Exception as e:
        print(e)
        return ''
Example #16
def get_urls(name_list):
    """
    Fetch every post on the site.
    :param name_list: list of calligrapher names
    :return:
    """
    urls = []
    for name in name_list:
        try:
            time.sleep(5)
            url = base_url + "/shufa/" + name
            print(name, "start")
            s = get_selector.get_selector(send_request.send_requests(url))
            result = s.xpath('//div[@class="caption ellipsis"]//a/@href')
            for i in range(len(result)):
                result[i] = base_url + result[i]
            print(name, "done")
            urls.extend(result)
        except Exception as e:
            print(e)
    print(urls)
    return urls
Example #17
def get_info():
    s = get_selector.get_selector(send_request.send_requests(base_url))
    name = s.xpath('//li//a[@rel="cpdq"]/text()')
    return name