def get_urls(all_author):
    """
    Collect every article written by each calligrapher.
    :param all_author: dict mapping author name -> index-page URL prefix
    :return: dict mapping author name -> {article title: article URL}
    """
    all_author_dic = {}
    for name in all_author:
        try:
            page = 1
            url = all_author[name] + "{index}.html".format(index=page)
            s = get_selector.get_selector(send_request.send_requests(url))
            result = s.xpath('//div[@class="navBtn font1_2"]/text()')[0]
            page_max = result[3:result.index('页')]
            result_dic = {}
            for er_page in range(int(page_max)):
                try:
                    url = all_author[name] + "{index}.html".format(
                        index=er_page + 1)
                    s = get_selector.get_selector(
                        send_request.send_requests(url))
                    result_url = s.xpath('//li//div[@class="b"]//h4/a/@href')
                    result_title = [None] * len(result_url)
                    for i in range(len(result_url)):
                        try:
                            s = get_selector.get_selector(
                                send_request.send_requests(
                                    base_url + result_url[i].strip('.')))
                            result_title[i] = s.xpath(
                                '//div[@class="a"]/h1/text()')[0]
                            # Repair a mis-decoded character and strip stray
                            # backslashes from the title.
                            if "�" in result_title[i]:
                                result_title[i] = result_title[i].replace(
                                    "�", "頫")
                            if "\\" in result_title[i]:
                                result_title[i] = result_title[i].replace(
                                    "\\", "")
                            result_dic[result_title[
                                i]] = base_url + result_url[i].strip('.')
                        except Exception:
                            pass
                    print("done", name, "page{page}".format(page=er_page + 1))
                except Exception as e:
                    print("error: failed to fetch this article", url, e)
            all_author_dic[name] = result_dic
        except Exception as e:
            print("error: failed to fetch this page", e)
    return all_author_dic
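# The send_request and get_selector modules are used throughout this section
# but never shown. A minimal sketch of what they are assumed to provide,
# built on requests and lxml (an assumption about the missing helpers, not
# the original implementation; in the real repo these live in their own
# modules and are called as send_request.send_requests / get_selector.get_selector):

import requests
from lxml import etree


def send_requests(url):
    # Fetch the raw page body; the encoding guess is a best effort.
    response = requests.get(url, timeout=10)
    response.encoding = response.apparent_encoding
    return response.text


def get_selector(html):
    # Wrap the HTML in an lxml tree so callers can run XPath queries on it.
    return etree.HTML(html)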
def get_img_info(dic):
    """
    Fetch the image URLs of every article and download them.
    :param dic: dict mapping author name -> {article title: article URL}
    """
    for name in dic:
        path = create_dir(r"/root/bao", name)  # create the per-author directory first
        print("*** processing ***", name)
        for title in dic[name]:
            try:
                all_imgs = []
                file_path = create_dir(path, title)  # then the per-article directory
                s = get_selector.get_selector(
                    send_request.send_requests(dic[name][title]))
                result_page = s.xpath('//td//a[@rel="nofollow"]/text()')
                img_list = s.xpath('//div[@id="newsContent"]//img/@src')
                for i in range(len(img_list)):
                    # Relative image paths start with '.', so prepend the site root.
                    if img_list[i].startswith('.'):
                        img_list[i] = base_url + img_list[i].lstrip('.')
                all_imgs.extend(img_list)
                if result_page:
                    max_page = int(result_page[-2].strip('.'))
                else:
                    max_page = 1
                for er_page in range(1, max_page):
                    # Follow-up pages are named <article>_2.html, <article>_3.html, ...
                    # (rstrip('.html') would strip characters, not the suffix, so slice instead)
                    url = dic[name][title][:-len('.html')]
                    url = url + "_{page}.html".format(page=er_page + 1)
                    s = get_selector.get_selector(
                        send_request.send_requests(url))
                    img_list = s.xpath(
                        '//div[@id="newsContent"]//img/@src')
                    for j in range(len(img_list)):
                        if img_list[j].startswith('.'):
                            img_list[j] = base_url + img_list[j].lstrip('.')
                    all_imgs.extend(img_list)
                count = 1
                for img in all_imgs:
                    # The target root is a Linux path, so join with '/' rather than '\\'.
                    filename = file_path + "/{i}.jpg".format(i=count)
                    down_load(img, filename)
                    count = count + 1
            except Exception as e:
                print("failed to download the images", e)
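# create_dir and down_load are called above but not defined in this section.
# A plausible sketch under the assumption that create_dir builds a nested
# directory and returns its path, and down_load streams one image to disk
# (both signatures are inferred from the call sites, not from the original code):

import os
import requests


def create_dir(parent, name):
    # Join the pieces and create the directory if it does not exist yet.
    path = os.path.join(parent, name)
    os.makedirs(path, exist_ok=True)
    return path


def down_load(img_url, filename):
    # Stream the image to disk so large files are not held in memory.
    response = requests.get(img_url, stream=True, timeout=10)
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)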
def get_img_element_id(url):
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        id_list = s.xpath('//div[@id="beitie-imgs-container"]//div/@id')
        return id_list
    except Exception as e:
        print(e)
def get_url(name):
    s = get_selector.get_selector(send_request.send_requests(search_url + name))
    url = s.xpath('//li[@class="item clearfix"][1]//h4//a/@href')[0]
    url = base_url + url
    print(url)
    return url
def get_info(url):
    """
    :param url:
    :return: dict whose "营养价值" entry (a) holds the nutritional value
             and whose "食用效果" entry (b) holds the dietary effects
    """
    dic = {"食材": "", "简介": "", "营养价值": "", "食用效果": ""}
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        # Collect the effect paragraphs that follow the first <strong> heading.
        fun = s.xpath(
            '//div[@class="fd_yygx"]//strong[1]/following-sibling::*')
        final_fun = []
        intro = s.xpath('//div[@class="n_sp_main_info"]//p/text()')
        dic["食材"] = s.xpath('//div[@class="fd_tit2"]//strong/text()')
        t = []
        jianjie = s.xpath('//div[@class="n_sp_main_info"]//p/text()')
        for i in jianjie:
            t.append(i.strip())
        dic["简介"] = t
        for i in range(len(fun)):
            info = s.xpath(
                '//div[@class="fd_yygx"]//strong[1]/following-sibling::*[%d]/text()'
                % (i + 1))
            if info:
                final_fun = final_fun + info
        for m in range(len(final_fun)):
            final_fun[m] = final_fun[m].strip()
        for i in range(len(intro)):
            intro[i] = intro[i].strip()
        if '' in final_fun:
            final_fun.remove('')
        # Drop the trailing "how to pick" and "how to store" sections.
        for i in range(len(final_fun)):
            if "的挑选" in final_fun[i]:
                final_fun = final_fun[0:i]
                break
        for i in range(len(final_fun)):
            if "的存储方法" in final_fun[i]:
                final_fun = final_fun[0:i]
                break
        a = []
        b = []
        # Split at the "...的食用效果" heading: everything before it is the
        # nutritional value, everything after it is the dietary effect.
        for i in range(len(final_fun)):
            if "的食用效果" in final_fun[i]:
                a = final_fun[1:i]
                b = final_fun[i + 1:]
        if len(a) == 0:
            print(final_fun[0])
            # No effects heading found; if the page only has a nutritional
            # value heading, take everything after it.
            if "的营养价值" in final_fun[0]:
                print("doit")
                a = final_fun[1:]
        print(a)
        print(b)
        dic["营养价值"] = a
        dic["食用效果"] = b
        return dic
    except Exception as e:
        print(e)
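# A worked example of the split above with made-up strings, so the slicing is
# easy to follow (the entries only mirror the site's heading pattern):
final_fun_demo = ["苹果的营养价值", "富含维生素", "苹果的食用效果", "生津止渴"]
for i in range(len(final_fun_demo)):
    if "的食用效果" in final_fun_demo[i]:
        a_demo = final_fun_demo[1:i]     # nutritional value paragraphs
        b_demo = final_fun_demo[i + 1:]  # dietary effect paragraphs
print(a_demo, b_demo)  # ['富含维生素'] ['生津止渴']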
def get_info(url):
    dic = {}
    s = get_selector.get_selector(send_request.send_requests(url))
    name = s.xpath('//h2[@class="crumb"]/text()')[-1].strip().replace('/ ', '')
    calorie = s.xpath(
        '//span[@id="food-calory"]//span[@class="stress red1"]/text()')[0]
    img_url = s.xpath('//div[@class="food-pic pull-left"]//a/@href')[0]
    dic["new_name"] = name
    dic["calorie"] = calorie
    dic["img_url"] = img_url
    return dic
def get_urls():
    """
    :return: the category links gathered from a fixed set of <dl> blocks
    """
    urls = []
    s = get_selector.get_selector(send_request.send_requests(aim_url))
    m = [8, 9, 10, 14, 15, 25]  # indices of the <dl> blocks of interest
    for i in m:
        url = s.xpath('//div[@class="fd_cont"]//dl[%d]//a/@href' % i)
        urls = urls + url
    return urls
def get_img_url(name):
    base_url = "http://so.meishi.cc/index.php?q="
    try:
        time.sleep(2)
        s = get_selector.get_selector(
            send_request.send_requests(base_url + name))
        img = s.xpath(
            '//div[@class="search2015_cpitem"][1]//a[@class="img"]//img/@src')
        print(name, img)
        return img[0]
    except Exception as e:
        print(e)
def get_url(page_url):
    """
    Return every article URL on the given listing page.
    :param page_url: URL of one listing page
    :return:
    """
    try:
        s = get_selector.get_selector(send_request.send_requests(page_url))
        urls = s.xpath('//div[@class="item_all"]//p[@class="item_tit"]//a/@href')
        return urls
    except Exception as e:
        print(e)
        return []
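# get_url only handles a single listing page; a hedged driver sketch showing
# how it might be looped over a numbered page range (the page-URL pattern and
# the page count are placeholders, not taken from the original site):

def collect_all_urls(list_url_prefix, page_count):
    all_urls = []
    for page in range(1, page_count + 1):
        # e.g. "<prefix>1.html", "<prefix>2.html", ...
        all_urls.extend(get_url(list_url_prefix + "{}.html".format(page)))
    return all_urls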
def get_urls(name_list):
    all_tie = {}
    for name in name_list:
        try:
            url = "/".join((base_url, "shufa", name))
            print("processing", name, url)
            s = get_selector.get_selector(send_request.send_requests(url))
            result = s.xpath('//div[@class="col-md-4 col-sm-6"]//a/@href')
            for i in result:
                all_tie[i.split('/')[-2]] = base_url + i
            print(all_tie)
        except Exception as e:
            print("failed to fetch the copybook list", e)
    return all_tie
def get_info(url):
    s = get_selector.get_selector(send_request.send_requests(url))
    info = s.xpath('//div[@class="content"]//dd[2]/following::dd//span/text()')
    info_dic = {}
    # The spans alternate label, value, label, value, ...
    for i in range(len(info) // 2):
        try:
            info_dic[info[i * 2]] = info[i * 2 + 1]
        except IndexError:
            pass
    # Replace any non-numeric value with the sentinel '-1'.
    for ingredient in info_dic:
        try:
            float(info_dic[ingredient])
        except ValueError:
            info_dic[ingredient] = '-1'
    return info_dic
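# The pairing loop above turns an alternating label/value list into a dict;
# a quick illustration with made-up data:
info_demo = ["热量", "95", "蛋白质", "0.3", "产地", "未知"]
demo_dic = {info_demo[i * 2]: info_demo[i * 2 + 1]
            for i in range(len(info_demo) // 2)}
print(demo_dic)  # {'热量': '95', '蛋白质': '0.3', '产地': '未知'}
# the float() check then rewrites the non-numeric '未知' to '-1'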
def get_info(dic):
    for name in dic:
        try:
            s = get_selector.get_selector(send_request.send_requests(
                dic[name]))
            img1 = base_url + s.xpath('//div[@id="a0"]/img/@src')[0]
            img_urls = s.xpath(
                '//div[@id="a0"]/following-sibling::*/@data-original')
            for i in range(len(img_urls)):
                img_urls[i] = base_url + img_urls[i]
            img_urls.append(img1)
            dic[name] = img_urls
            print("get info done", name)
        except Exception as e:
            print("failed to fetch the image links", e)
    return dic
def get_info(url):
    """
    Fetch the details of one dish.
    :param url: URL of the dish page
    :return: the populated dict
    """
    dic = {
        "菜名": "",
        "分类": "",
        "口味": "",
        "食材": "",
        "主要工艺": "",
        "制作时间": "",
        "做法": "",
        "图片url": "",
    }
    try:
        time.sleep(1)
        print("open", url)
        s = get_selector.get_selector(send_request.send_requests(url))
        dic["图片url"] = s.xpath('//div[@class="cp_headerimg_w"]//img/@src')[0]
        dic["菜名"] = s.xpath('//h1[@class="title"]//a/text()')[0]
        dic["分类"] = s.xpath('//dl[@class="yj_tags clearfix"]//a/text()')
        dic["主要工艺"] = s.xpath('//li[@class="w127"]//a/text()')[0]
        dic["口味"] = s.xpath('//li[@class="w127 bb0"]//a/text()')[0]
        dic["制作时间"] = s.xpath(
            '//li[@class="w270 bb0 br0"]//div[@class="processing_w"]//a/text()')[0]
        zhuliao = s.xpath('//div[@class="c"]//h4/child::*/text()')
        fuliao = s.xpath(
            '//div[@class="yl fuliao clearfix"]//ul[@class="clearfix"]/descendant::*/text()')
        # Both lists alternate ingredient name and amount; pair them up,
        # stepping by two so an odd trailing entry cannot raise IndexError.
        cailiao = []
        for i in range(0, len(zhuliao) - 1, 2):
            cailiao.append(zhuliao[i] + ":" + zhuliao[i + 1])
        for i in range(0, len(fuliao) - 1, 2):
            cailiao.append(fuliao[i] + ":" + fuliao[i + 1])
        dic["食材"] = cailiao
        steps = s.xpath('//div[@class="editnew edit"]//div//p/text()')
        for i in range(len(steps)):
            steps[i] = str(i + 1) + steps[i]
        dic["做法"] = steps
        print(dic.get("菜名"), "done")
    except Exception as e:
        print("failed to fetch the dish", e)
    return dic
def get_urls(menu_url):
    """
    Collect the dish URLs listed on each page of one recipe category.
    :param menu_url: URL prefix of the category
    :return: all dish URLs in that category
    """
    urls_list = []
    for i in range(11):
        time.sleep(1)
        try:
            url = menu_url + str(i + 1)
            print(url)
            s = get_selector.get_selector(send_request.send_requests(url))
            urls = s.xpath('//div[@class="listtyle1"]//a/@href')
            urls_list.extend(urls)
            print("page" + str(i), "done")
        except Exception as e:
            print("failed to open the page", e)
    return urls_list
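# How get_urls and the dish-level get_info above are presumably chained
# together (the category URL is a placeholder; the real menu_url values are
# not shown in this section):

if __name__ == "__main__":
    menu_url = "https://example.com/recipe-category/page"  # placeholder
    for dish_url in get_urls(menu_url):
        dish = get_info(dish_url)
        print(dish["菜名"], dish["图片url"])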
def get_info(url):
    """
    Fetch the paragraph text of an article.
    (The original named this list `comment`; it holds the article content,
    so it is renamed `content` here.)
    :param url: URL of the detail page
    :return:
    """
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        content = s.xpath(
            '//div[@id="articleContent"]/p/following-sibling::*/text()')
        title = s.xpath('//h1/text()')[0]
        # Strip whitespace and any leading punctuation from each paragraph;
        # lstrip() takes a set of characters, so one call covers them all.
        for i in range(len(content)):
            content[i] = content[i].strip().lstrip('。(?)!:、,》《:')
        # Drop empty and single-character paragraphs (filtering into a new
        # list avoids mutating the list while iterating over it).
        content = [p for p in content if len(p) > 1]
        try:
            # Cut the text off at the copyright notice if it is present.
            index = content.index("本篇文章版权归民福康健康所有,未经许可,谢绝转载。")
            text = "\n".join(content[:index])
        except ValueError:
            text = "\n".join(content)
        print("fetching", content[0][:5], "...")
        dic = {}
        dic['title'] = title
        dic['content'] = text
        return dic
    except Exception as e:
        print(e)
        return ''
def get_urls(name_list):
    """
    Collect every post on the site.
    :param name_list: the person_name_list
    :return:
    """
    urls = []
    for name in name_list:
        try:
            time.sleep(5)
            url = base_url + "/shufa/" + name
            print(name, "start")
            s = get_selector.get_selector(send_request.send_requests(url))
            result = s.xpath('//div[@class="caption ellipsis"]//a/@href')
            for i in range(len(result)):
                result[i] = base_url + result[i]
            print(name, "done")
            urls.extend(result)
        except Exception as e:
            print(e)
    print(urls)
    return urls
def get_info():
    s = get_selector.get_selector(send_request.send_requests(base_url))
    name = s.xpath('//li//a[@rel="cpdq"]/text()')
    return name