def get_info(url):
    """Scrape one ingredient page for its nutrition / usage sections.

    :param url: ingredient detail-page URL
    :return: dict with keys 食材/简介/营养价值/食用效果, where 营养价值 ("a")
        and 食用效果 ("b") are lists of cleaned text lines; returns None when
        scraping fails (original contract preserved).
    """
    dic = {"食材": "", "简介": "", "营养价值": "", "食用效果": ""}
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        # Siblings following the first <strong> inside the fd_yygx panel hold
        # the section texts we want.
        fun = s.xpath(
            '//div[@class="fd_yygx"]//strong[1]/following-sibling::*')
        final_fun = []
        dic["食材"] = s.xpath('//div[@class="fd_tit2"]//strong/text()')
        jianjie = s.xpath('//div[@class="n_sp_main_info"]//p/text()')
        dic["简介"] = [p.strip() for p in jianjie]
        for i in range(len(fun)):
            # xpath() always returns a list (possibly empty), never None, so
            # the original `if info is not None` guard was dead; just extend.
            info = s.xpath(
                '//div[@class="fd_yygx"]//strong[1]/following-sibling::*[%d]/text()'
                % (i + 1))
            final_fun.extend(info)
        final_fun = [t.strip() for t in final_fun]
        # BUG FIX: the original called final_fun.remove('') unconditionally;
        # ValueError on a list without an empty string silently aborted the
        # whole function through the broad except. Guard the removal (still
        # removes only the first '' to match original slicing behavior).
        if '' in final_fun:
            final_fun.remove('')
        # Drop everything from the "how to pick" / "how to store" sections on.
        for marker in ("的挑选", "的存储方法"):
            for i, text in enumerate(final_fun):
                if marker in text:
                    final_fun = final_fun[:i]
                    break
        a = []
        b = []
        for i, text in enumerate(final_fun):
            if "的食用效果" in text:
                a = final_fun[1:i]   # index 0 is the "…的营养价值" heading
                b = final_fun[i + 1:]
        # BUG FIX: `len(a) is 0` compared identity, not equality; use
        # truthiness. Also guard final_fun[0] against an empty list, which
        # previously raised IndexError into the broad except.
        if not a and final_fun:
            print(final_fun[0])
            if "的营养价值" in final_fun[0]:
                print("doit")
                a = final_fun[1:]
        print(a)
        print(b)
        dic["营养价值"] = a
        dic["食用效果"] = b
        return dic
    except Exception as e:
        print(e)
def get_info(url):
    """Collect a food's display name, calorie figure and image URL.

    :param url: food detail-page URL
    :return: dict with keys "new_name", "calorie", "img_url"
    """
    selector = get_selector.get_selector(send_request.send_requests(url))
    raw_name = selector.xpath('//h2[@class="crumb"]/text()')[-1]
    return {
        "new_name": raw_name.strip().replace('/ ', ''),
        "calorie": selector.xpath(
            '//span[@id="food-calory"]//span[@class="stress red1"]/text()')[0],
        "img_url": selector.xpath(
            '//div[@class="food-pic pull-left"]//a/@href')[0],
    }
def get_urls():
    """Gather category link hrefs from selected <dl> sections of the index page.

    :return: list of hrefs found under dl positions 8, 9, 10, 14, 15 and 25
    """
    selector = get_selector.get_selector(send_request.send_requests(aim_url))
    collected = []
    for idx in (8, 9, 10, 14, 15, 25):
        collected.extend(
            selector.xpath('//div[@class="fd_cont"]//dl[%d]//a/@href' % idx))
    return collected
def get_img_url(name):
    """Search meishi.cc for *name* and return the first hit's image src.

    :param name: dish/ingredient name used as the search query
    :return: image URL string, or None (implicitly) when the lookup fails
    """
    search_url = "http://so.meishi.cc/index.php?q=" + name
    try:
        time.sleep(2)  # be polite to the search endpoint
        selector = get_selector.get_selector(
            send_request.send_requests(search_url))
        matches = selector.xpath(
            '//div[@class="search2015_cpitem"][1]//a[@class="img"]//img/@src')
        print(name, matches)
        return matches[0]
    except Exception as e:
        print(e)
def get_url(page_url):
    """Return every item URL listed on one result page.

    :param page_url: URL of a single listing page
    :return: list of detail-page hrefs; empty list when the page fails to load
    """
    try:
        page = get_selector.get_selector(send_request.send_requests(page_url))
        return page.xpath(
            '//div[@class="item_all"]//p[@class="item_tit"]//a/@href')
    except Exception as e:
        print(e)
        return []
def get_urls(name_list):
    """Build a mapping of calligraphy-piece slug -> absolute URL per author.

    :param name_list: iterable of author names
    :return: dict of {slug: absolute URL}; failed authors are skipped
    """
    all_tie = {}
    for name in name_list:
        try:
            url = "/".join((base_url, "shufa", name))
            print("正在处理", name, url)
            page = get_selector.get_selector(send_request.send_requests(url))
            for href in page.xpath('//div[@class="col-md-4 col-sm-6"]//a/@href'):
                # Second-to-last path segment is the piece's slug.
                all_tie[href.split('/')[-2]] = base_url + href
            print(all_tie)
        except Exception as e:
            print("字帖获取失败了", e)
    return all_tie
def get_info(url):
    """Scrape a food's nutrient table into a name -> value dict.

    The page's <span> texts alternate nutrient-name, value. Any value that is
    not parseable as a number is replaced with '-1'.

    :param url: food detail-page URL
    :return: dict mapping nutrient name to value string
    """
    s = get_selector.get_selector(send_request.send_requests(url))
    info = s.xpath('//div[@class="content"]//dd[2]/following::dd//span/text()')
    info_dic = {}
    # Pair texts (name, value). The loop bound len // 2 guarantees both
    # indexes exist, so the original bare `except: pass` was dead weight
    # hiding real bugs — removed.
    for i in range(len(info) // 2):
        info_dic[info[2 * i]] = info[2 * i + 1]
    for ingredient in info_dic:
        # Normalize non-numeric values. BUG FIX: the bare except is narrowed
        # to the exceptions float() can actually raise.
        try:
            float(info_dic[ingredient])
        except (ValueError, TypeError):
            info_dic[ingredient] = '-1'
    return info_dic
def get_info(dic):
    """Replace each {name: page_url} entry with the image URLs on that page.

    Mutates *dic* in place and also returns it. Entries that fail to load
    keep their original URL value.
    """
    for name in dic:
        try:
            page = get_selector.get_selector(
                send_request.send_requests(dic[name]))
            cover = base_url + page.xpath('//div[@id="a0"]/img/@src')[0]
            links = [
                base_url + link
                for link in page.xpath(
                    '//div[@id="a0"]/following-sibling::*/@data-original')
            ]
            links.append(cover)
            dic[name] = links
            print("get info done", name)
        except Exception as e:
            print("图片链接获取失败", e)
    return dic
def get_info(url):
    """Fetch a dish's detail page and extract its attributes.

    :param url: dish detail-page URL
    :return: dict with keys 菜名/分类/口味/食材/主要工艺/制作时间/做法/图片url;
        fields keep their "" defaults when scraping fails.
    """
    dic = {
        "菜名": "",
        "分类": "",
        "口味": "",
        "食材": "",
        "主要工艺": "",
        "制作时间": "",
        "做法": "",
        "图片url": "",
    }
    try:
        time.sleep(1)  # throttle scraping
        print("open", url)
        s = get_selector.get_selector(send_request.send_requests(url))
        dic["图片url"] = s.xpath('//div[@class="cp_headerimg_w"]//img/@src')[0]
        dic["菜名"] = s.xpath('//h1[@class="title"]//a/text()')[0]
        dic["分类"] = s.xpath('//dl[@class="yj_tags clearfix"]//a/text()')
        dic["主要工艺"] = s.xpath('//li[@class="w127"]//a/text()')[0]
        dic["口味"] = s.xpath('//li[@class="w127 bb0"]//a/text()')[0]
        dic["制作时间"] = s.xpath(
            '//li[@class="w270 bb0 br0"]//div[@class="processing_w"]//a/text()')[0]
        zhuliao = s.xpath('//div[@class="c"]//h4/child::*/text()')
        fuliao = s.xpath(
            '//div[@class="yl fuliao clearfix"]//ul[@class="clearfix"]/descendant::*/text()')
        # Texts alternate ingredient-name, amount. BUG FIX: the original
        # indexed items[i + 1] for every even i, so an odd-length list raised
        # IndexError and the broad except discarded every remaining field.
        # Zip the even/odd slices instead (a trailing unpaired name is
        # skipped rather than crashing).
        cailiao = []
        for items in (zhuliao, fuliao):
            for ingredient, amount in zip(items[0::2], items[1::2]):
                cailiao.append(ingredient + ":" + amount)
        dic["食材"] = cailiao
        steps = s.xpath('//div[@class="editnew edit"]//div//p/text()')
        # Prefix each step with its 1-based number.
        dic["做法"] = [str(n + 1) + step for n, step in enumerate(steps)]
        print(dic.get("菜名"), "done")
    except Exception as e:
        print("获取菜失败", e)
    return dic
def get_urls(menu_url):
    """Collect dish URLs from the first 11 pages of one recipe category.

    :param menu_url: category base URL (the page number is appended to it)
    :return: list of dish detail-page URLs; failed pages are skipped
    """
    dishes = []
    for page_no in range(1, 12):
        time.sleep(1)  # throttle scraping
        try:
            page_url = menu_url + str(page_no)
            print(page_url)
            page = get_selector.get_selector(
                send_request.send_requests(page_url))
            dishes.extend(page.xpath('//div[@class="listtyle1"]//a/@href'))
            # Original progress message used the 0-based page index.
            print("page" + str(page_no - 1), "done")
        except Exception as e:
            print("打开失败", e)
    return dishes
def get_info(url):
    """Fetch an article page and return its title and cleaned body text.

    (Original author's note: the variable is named "comment" but holds the
    article content.)

    :param url: article detail-page URL
    :return: dict with keys 'title' and 'content', or '' when scraping fails
    """
    try:
        selector = get_selector.get_selector(send_request.send_requests(url))
        comment = selector.xpath(
            '//div[@id="articleContent"]/p/following-sibling::*/text()')
        title = selector.xpath('//h1/text()')[0]
        # BUG FIX: the original removed items from `comment` while iterating
        # it (which skips the element after each removal) and compared with
        # `is` instead of `==`. Filter into a new list with value equality,
        # dropping empty and single-character fragments.
        # The eleven chained lstrip calls are merged into one character-set
        # lstrip covering the same punctuation.
        leading_punct = '。(?)!:、,》《:'
        cleaned = []
        for line in comment:
            line = line.strip().lstrip(leading_punct)
            if line != '' and len(line) != 1:
                cleaned.append(line)
        comment = cleaned
        # Cut the body at the copyright notice if present. BUG FIX: the loop
        # variable previously reused the name `s` (the selector); narrowed the
        # except to the ValueError that list.index actually raises.
        try:
            index = comment.index("本篇文章版权归民福康健康所有,未经许可,谢绝转载。")
            content = "\n".join(comment[:index])
        except ValueError:
            content = "\n".join(comment)
        # Guard against an empty body — comment[0] previously raised
        # IndexError into the broad except, returning '' without the title.
        if comment:
            print("正在获取", comment[0][:5], "...")
        return {'title': title, 'content': content}
    except Exception as e:
        print(e)
        return ''
def get_urls(list):
    """Collect every post URL for each person on the site.

    :param list: person_name_list (NOTE: parameter name shadows the builtin
        `list`; kept unchanged because it is part of the public signature)
    :return: list of absolute post URLs; failed names are skipped
    """
    urls = []
    for name in list:
        try:
            time.sleep(5)  # throttle requests
            person_url = base_url + "/shufa/" + name
            print(name, "start")
            page = get_selector.get_selector(
                send_request.send_requests(person_url))
            found = [
                base_url + href
                for href in page.xpath(
                    '//div[@class="caption ellipsis"]//a/@href')
            ]
            print(name, "done")
            urls.extend(found)
        except Exception as e:
            print(e)
    print(urls)
    return urls
def get_info():
    """Return the text of every <a rel="cpdq"> link on the site's base page."""
    page = get_selector.get_selector(send_request.send_requests(base_url))
    return page.xpath('//li//a[@rel="cpdq"]/text()')