def download(url, number, timeout):
    """Fetch one bitrating.com alert page and persist its fields.

    `timeout` is the publication time determined by the caller; it is passed
    straight through to storage().
    """
    print("bitrating_alerts")
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    # Only parse when the page actually loaded
    if reponse.status_code == 200:
        try:
            html = etree.HTML(reponse.text)
            # Title, body paragraphs, author and source
            title = html.xpath(
                '/html/body/section/div[1]/div/header/div[1]/h1/a/text()')[0]
            texts = html.xpath(
                '/html/body/section/div[1]/div/article/p/text()')
            author = html.xpath(
                '/html/body/section/div[1]/div/article/div/text()')[0].split()[0]
            source = "比特评级--快讯:" + url
            # The post footer carries a recommendation / disclaimer block.
            # FIX: decode("utf") worked only via a codec alias; normalized to
            # "utf8" for consistency with the rest of the file (same bytes).
            recommends = html.xpath('//div[@class="asb-post-footer"]')[0]
            recommend = etree.tostring(recommends, method="text",
                                       encoding="utf8").decode("utf8")
            recommend += ": https://bitrating.com/wenda"
            statement = html.xpath(
                '/html/body/section/div[1]/div/div[3]/text()')[0]
            # Persist the scraped record
            storage(number, title, timeout, author, source, recommend,
                    statement, texts)
        except Exception as err:
            mistake(url, err)
    else:
        err = reponse.status_code
        mistake(url, err)
def download(url, number):
    """Fetch one hashcaijing article and store it.

    Returns True when the title is empty (signal to the caller to stop).
    """
    print('hashcaijing')
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    # Only parse when the page actually loaded
    if reponse.status_code == 200:
        try:
            # Title, publication time, author and source
            html = etree.HTML(reponse.text)
            title = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[1]/b/text()')[0]
            # FIX: idiomatic emptiness test (was `if not len(title)`)
            if not title:
                return True
            timeout = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[1]/text()')[0]
            author = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[2]/text()')[0]
            source = "哈希财经" + ":" + url
            texts = html.xpath('//div[@class="contentNews"]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            storage(number, title, timeout, author, source, text)
        except Exception as err:
            mistake(url, err)
    else:
        err = reponse.status_code
        mistake(url, err)
def download(html, number, url):
    """Extract one youjiatuanjian.com article from `html` and store it."""
    try:
        print("youjiatuanjian")
        # Article title
        title = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[1]/text()')[0]
        # Source: display name plus an absolute link back to the site
        source_name = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[3]/p[1]/span/text()')[0]
        source_url = html.xpath(
            '//*[@id="article-wrap"]/div/div[2]/div[1]/div[1]/a/@href')[0]
        source = source_name + "--http://youjiatuanjian.com" + source_url
        # Publication time
        timeout = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[2]/div[2]/span/text()')[0]
        # Author
        author = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[2]/div[1]/span/text()')[0]
        # Body: serialise the content node to plain text, then tokenise
        content_node = html.xpath('//*[@id="article-wrap"]/div/div[1]/div[3]')[0]
        tokens = etree.tostring(content_node, method="text",
                                encoding="utf8").decode("utf8").split()
        # Drop the leading token; the article body proper starts at index 1
        mains = tokens[1:]
        storage(number, title, author, timeout, source, mains)
    except Exception as err:
        mistake(url, err)
def starts(headers):
    """Crawl jinse.com: collect article URLs from the home page and follow each."""
    s = 0
    url = "https://www.jinse.com"
    while True:
        reponse = requests.get(url, headers=headers)
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            html = reponse.text
            # Collect every article URL found on the home page.
            # FIX: the character class was [a-zA-z], which also matches
            # the ASCII range between 'Z' and 'a' ([ \ ] ^ _ `); corrected
            # to [a-zA-Z].
            pattern = re.compile(r'[a-zA-Z]+://www.jinse.com[^\s]*\.html')
            urls = re.findall(pattern, html)
            urls = list(set(urls))
            for url in urls:
                # NOTE(review): this status check re-tests the home-page
                # response (already known to be 200), so the else branch is
                # unreachable; kept for behavioural fidelity.
                if reponse.status_code == 200:
                    titleUrl(url, headers)
                else:
                    err = reponse.status_code
                    mistake(url, err)
                    break
            break
        else:
            # Allow three reloads of the home page before giving up
            if s == 3:
                err = reponse.status_code
                mistake(url, err)
                break
            s += 1
def download(number):
    """Fetch one zhilianfm item by id; only '快讯' (alert) items are stored.

    Returns True after a successful pass (and implicitly None when the item
    is not an alert, matching the original control flow).
    """
    url = 'http://www.zhilianfm.com/zlfmCms/kx/%s.jhtml' % number
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        html = etree.HTML(reponse.text)
        classify = html.xpath(
            '/html/body/section/legend/a[2]/text()')[0].split()[0]
        # Skip anything that is not in the alerts ("快讯") category
        if classify != "快讯":
            return
        print("zhilianfm_alerts")
        # Title
        title = html.xpath('/html/body/div[2]/section/h1/text()')[0]
        # One header line holds "<time> <author> <source>"
        author_timeout_source = html.xpath(
            '/html/body/div[2]/section/div[1]/text()')[0].split()
        author = author_timeout_source[1]
        timeout = author_timeout_source[0]
        source = author_timeout_source[2]
        # Body text.
        # FIX: was a quadratic `text += i + " "` loop; a join produces the
        # identical string (including the trailing space) in linear time.
        texts = html.xpath(
            '/html/body/div[2]/section/div[2]/text()')[0].split()
        text = "".join(i + " " for i in texts)
        storage(number, title, author, timeout, source, text, classify)
    else:
        err = reponse.status_code
        mistake(url, err)
    return True
def getUrl(reponse):
    """Find the highest news id on a btc123 listing page, then walk ids downward."""
    html = reponse.text
    pattern = re.compile(r"news/[^\s]*\/\d+")
    urls = re.findall(pattern, html)
    # Highest article number present on the page.
    # FIX: the original did max() over lists of digit *strings*, which
    # compares lexicographically ("9" > "10"); convert to int before max().
    pattern_num = re.compile(r"\d+")
    numbers = [int(re.findall(pattern_num, i)[0]) for i in urls]
    max_number = max(numbers)
    reload = 0
    while True:
        try:
            url = "https://www.btc123.com/news/newsDetails/%s" % max_number
            reponse_news = requests.get(url, headers=headers.header())
            reponse_news.encoding = "utf-8"
            # Stop once we reach an item already stored in the database
            if rechecking(max_number, come_from="btc123"):
                break
            if reponse_news.status_code == 200:
                download(reponse_news, url, max_number)
                max_number -= 1
            else:
                err = reponse_news.status_code
                mistake(url, err)
                # Allow up to three reload attempts
                if reload == 3:
                    break
                reload += 1
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            if reload == 3:
                break
            reload += 1
def starts():
    """Scan the huoxing24 home page for news links and download each one."""
    url = "http://www.huoxing24.com/"
    try:
        # Load the home page
        reponse = requests.get(url, headers=header())
        reponse.encoding = "utf-8"
        html = reponse.text
        # Each news card sits between these two markers
        pattern = re.compile('<div class="index-news-list">[\s\S]*?<div class="shadow">')
        texts = re.findall(pattern, html)
        for text in texts:
            # Pull the article URL out of the card.
            # FIX: the character class was [a-zA-z], which also matches
            # [ \ ] ^ _ ` — corrected to [a-zA-Z].
            pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
            url = re.findall(pattern, text)[0]
            reponse = requests.get(url, headers=header())
            reponse.encoding = "utf-8"
            # Only hand off pages that actually loaded
            if reponse.status_code == 200:
                html = reponse.text
                download(html, url)
            else:
                err = reponse.status_code
                mistake(url, err)
    except Exception as err:
        mistake(url, err)
def starts():
    """Crawl both list endpoints, every sub-type (0-2), page by page."""
    # The two list APIs served by this host
    endpoints = [
        'http://39.108.117.97:8082/hotNewsList?size=10&page=%s&subType=',
        "http://39.108.117.97:8082/blockChainList?size=10&page=%s&subType=",
    ]
    for endpoint in endpoints:
        # Each endpoint carries three sub-categories (six feeds in total)
        for sub_type in range(3):
            template = endpoint + str(sub_type)
            page = 1
            # Keep paging until getUrl() signals it is done
            while True:
                url = template % page
                resp = requests.get(url, headers=headers.header())
                resp.encoding = "utf-8"
                if resp.status_code != 200:
                    mistake(url, resp.status_code)
                    break
                if getUrl(resp):
                    break
                page += 1
def download(reponse, url, number):
    """Parse a tuoluocaijing.cn article page and store the record."""
    try:
        print("tuoluocaijing")
        html = etree.HTML(reponse.text)
        # Title and publication time
        title = html.xpath('/html/body/div[6]/div[1]/div/h1/text()')[0]
        timeout = html.xpath(
            '/html/body/div[6]/div[1]/div/div[1]/span[3]/text()')[0]
        # Author display name and profile link, combined into one field
        author_name = html.xpath(
            '/html/body/div[6]/div[1]/div/div[1]/span[1]/a/text()')[0]
        author_link = html.xpath(
            '/html/body/div[6]/div[1]/div/div[1]/span[1]/a/@href')[0]
        author = author_name + "--https://www.tuoluocaijing.cn" + author_link
        # Tags attached to the article
        label = html.xpath('/html/body/div[6]/div[1]/div/div[3]/a/text()')
        # Body text, serialised and split into tokens
        body_node = html.xpath("/html/body/div[6]/div[1]/div/div[2]")[0]
        text = etree.tostring(body_node, method="text",
                              encoding="utf8").decode("utf8").split()
        # Site disclaimer, if present
        statement = html.xpath('/html/body/div[6]/div[1]/div/p/text()')
        source = "陀螺财经--:https://www.tuoluocaijing.cn/"
        storage(number, title, timeout, author, source, text, statement, label)
    except Exception as err:
        mistake(url, err)
def starts():
    """Page through the weilaicaijing fast-news API until download() says stop."""
    page = 1
    reload = 0
    while True:
        url = "http://weilaicaijing.com/api/Fastnews/lists?search_str=&page=%s" % page
        resp = requests.get(url, headers=headers.header())
        resp.encoding = "utf-8"
        if resp.status_code == 200:
            # download() returns truthy once it reaches already-stored items
            if download(resp, url):
                break
            page += 1
        elif resp.status_code == 503:
            # This site throws 503 frequently; allow five retries
            if reload == 5:
                mistake(url, resp.status_code)
                break
            reload += 1
        else:
            # Any other failure: three attempts in total
            if reload == 2:
                mistake(url, resp.status_code)
                break
            reload += 1
def download(number):
    """Fetch one news item from the JSON detail API and store it."""
    # Detail endpoint for a single item
    url = 'http://39.108.117.97:8082/geek/infoDetail/1/%s' % number
    resp = requests.get(url, headers=headers.header())
    resp.encoding = "utf-8"
    if resp.status_code != 200:
        mistake(url, resp.status_code)
        return
    # The endpoint returns JSON; pull out the payload dict
    data = json.loads(resp.text)["data"]
    # releasedTime is epoch milliseconds -> human-readable local time
    timeout = time.asctime(time.localtime(float(data["releasedTime"]) / 1000))
    # The article body arrives as an HTML fragment inside the JSON
    main_html = etree.HTML(data["content"])
    main_text = etree.tostring(main_html, method="text",
                               encoding="utf8").decode("utf8").split()
    # Any images embedded in the body
    img = main_html.xpath('//img/@src')
    storage(data, timeout, main_text, img)
def divide(i):
    """Parse one tuoniaox alert fragment (a regex-matched HTML snippet) and store it."""
    try:
        print("tuoniaox_alerts")
        # Turn the matched HTML fragment into an element tree
        html = etree.HTML(i)
        # Posting time shown on the card
        timeout = html.xpath('//span/text()')[0]
        # Alert body, flattened to space-separated text (trailing space kept,
        # matching the original accumulator's output)
        texts = html.xpath('//p/text()')[0].split()
        text = "".join(word + " " for word in texts)
        # The title is the 【...】 prefix of the body
        pattern = re.compile("【[\s\S]*?】")
        title = re.findall(pattern, text)[0]
        # No numeric id exists for these alerts, so dedupe by title
        if title_find(title, come_from="tuoniaox_alerts"):
            return
        # Optional "M月D日" date inside the text
        pattern = re.compile("\d月\d日")
        accurate = re.findall(pattern, text)
        accurate = accurate[0] if accurate else ""
        author = "鸵鸟区块链:https://www.tuoniaox.com/"
        # A link inside the fragment means the alert is a translation
        source = html.xpath('//a/@href')
        if source:
            # FIX: the original concatenated str + list, which raised
            # TypeError (silently routed to mistake()); use the first href.
            source = "负责编译--原文:" + source[0]
        else:
            source = "鸵鸟区块链--快讯"
        storage(title, author, timeout, accurate, source, text)
    except Exception as err:
        mistake(url="https://www.tuoniaox.com/", err=err)
def getUrl(reponse):
    """Collect alert-detail links from a tuoluocaijing listing page and download each."""
    pattern = re.compile(r'/kuaixun/detail-\d+\.html')
    urls = re.findall(pattern, reponse.text)
    urls = list(set(urls))
    for i in urls:
        # NOTE(review): the retry counter resets on every URL, so the
        # `reload == 3` cap below can never actually trigger — confirm intent.
        reload = 0
        try:
            url = "https://www.tuoluocaijing.cn" + i
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                pattern_num = re.compile(r"\d+")
                number = re.findall(pattern_num, url)[0]
                # Stop once we reach an item that is already stored
                if rechecking(number, come_from="tuoluocaijing_alerts"):
                    break
                download(reponse, number, url)
            else:
                err = reponse.status_code
                mistake(url, err)
                if reload == 3:
                    break
                reload += 1
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            if reload == 3:
                break
            reload += 1
def download(url):
    """Scrape one jinse.com alert page; returns True when the page failed to load."""
    print("jinse_alerts")
    resp = requests.get(url, headers=headers.header())
    resp.encoding = "utf-8"
    if resp.status_code != 200:
        mistake(url, resp.status_code)
        return True
    # Stitch the timestamp together from three blocks, then split the intro
    # block into title / body / author slices.
    try:
        html = etree.HTML(resp.text)
        # Date portion
        node = html.xpath('//*[@class="tc"]')[0]
        words = etree.tostring(node, method="text",
                               encoding="utf8").decode("utf8").split()
        times = words[0] + words[1]
        # Clock portion
        node = html.xpath('//*[@class="time-detail"]')[0]
        words = etree.tostring(node, method="text",
                               encoding="utf8").decode("utf8").split()
        times = times + "--" + words[0]
        # Title / body / author all live in the intro block
        node = html.xpath('//*[@class="intro-detail"]')[0]
        words = etree.tostring(node, method="text",
                               encoding="utf8").decode("utf8").split()
        titles = words[0:3]
        authors = words[-5:]
        mains = words[3:-5]
        storage(titles, authors, times, url, mains)
    except Exception as err:
        mistake(url, err)
    return False
def starts():
    """Walk the polo321 alerts feed in steps of 20 ids."""
    n = 8208
    first_run = True
    # Resume from the newest stored id when the table is non-empty
    number = max_id(come_from="polo321_alerts")
    if number:
        n = number + 20
        first_run = False
    while True:
        try:
            url = "http://39.108.117.97:8082/lives/getList?Id=%s&flag=down" % n
            resp = requests.get(url, headers=headers.header())
            resp.encoding = "utf-8"
            if resp.status_code == 200:
                if classify(resp):
                    break
            else:
                mistake(url, resp.status_code)
                break
            # The first run walks backwards through history; later runs forward
            if first_run:
                n -= 20
            else:
                n += 20
        except TimeoutError:
            time.sleep(10)
def download(html, number, url):
    """Store a longkuai alert; pages that embed images are treated as articles and skipped."""
    # Alert posts never embed images — use that as the first filter
    img = html.xpath('//*[@id="main"]/div/div[1]/div[1]/p/img/@src')
    if img:
        return
    try:
        text = html.xpath('//*[@id="main"]/div/div[1]/div[1]/p/text()')
        # Second filter: a lone newline means there is no alert body
        if not (len(text) == 1 and text[0] == "\n"):
            print("longkuai_alerts")
            # Title and publication time
            title = html.xpath('//*[@id="main"]/div/div[1]/div[1]/h1/text()')[0]
            timeout = html.xpath(
                '//*[@id="main"]/div/div[1]/div[1]/div[1]/span[1]/text()')[0]
            # The page's editor field serves as the author
            author = html.xpath('//*[@id="main"]/div/div[2]/div[1]/h1/text()')[0]
            # Source line: either an original-content note or a reprint credit
            src_node = html.xpath('//*[@id="main"]/div/div[1]/div[1]/div[2]')[0]
            src_words = etree.tostring(src_node, method="text",
                                       encoding="utf8").decode("utf8").split()
            content_source = "".join(src_words)
            # Site disclaimer
            statement = html.xpath(
                '//*[@id="main"]/div/div[1]/div[1]/div[3]/text()')[0]
            storage(number, title, author, timeout, content_source,
                    statement, text)
    except Exception as err:
        mistake(url, err)
def download(url, number):
    """Fetch one shilian.com item (the site serves GBK) and store it."""
    print("shilian_alerts")
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "gbk"
    if reponse.status_code == 200:
        try:
            html = etree.HTML(reponse.text)
            # Title, author line and publication time
            title = html.xpath('/html/body/div[2]/div[1]/div/h1/text()')[0]
            theSidebar = html.xpath('/html/body/div[2]/div[1]/div/div[1]/text()')
            author = "世链财经--快讯:" + theSidebar[0]
            timeout = theSidebar[1]
            # Category of the item.
            # FIX: xpath() returns a list; the original interpolated the whole
            # list into the source string (producing e.g. "...['快讯']...").
            # Use the first entry (empty string when absent).
            classify_list = html.xpath('/html/body/div[2]/div[1]/div/div[1]/a/text()')
            classify = classify_list[0] if classify_list else ""
            source = ("世链财经--资讯--%s:" % classify) + url
            texts = html.xpath('/html/body/div[2]/div[1]/div/div[3]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            # Disclaimer block
            statements = html.xpath('/html/body/div[2]/div[1]/div/div[5]')[0]
            statement = etree.tostring(statements, method="text",
                                       encoding="utf8").decode("utf8").split()
            # Tag: heading text + tag word + absolute link
            label_head = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/text()')[0]
            label_word = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/text()')[0]
            label_url = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/@href')[0]
            label = label_head + label_word + "--http://www.shilian.com" + label_url
            # Persist the scraped record
            storage(number, title, timeout, author, source, statement, text, label)
        except Exception as err:
            mistake(url, err)
    else:
        err = reponse.status_code
        mistake(url, err)
def download(url):
    """Fetch one huoxing24 alert page and store it."""
    print("huoxing24_alerts")
    reponse = requests.get(url, headers=header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        try:
            # The second number in the URL is the alert id
            pattern_num = re.compile(r'\d+')
            number = re.findall(pattern_num, url)[1]
            # Skip items already downloaded
            if rechecking(number, come_from="huoxing_alerts"):
                return
            down = etree.HTML(reponse.text)
            texts = down.xpath('/html/body/div[5]/div[1]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            # Reassemble "<month><day>日--<date>--<clock>" from the tokens.
            # (Renamed from `time`, which shadowed the time module.)
            alert_time = text[1] + text[0] + "日" + "--" + text[3] + "--" + text[4]
            title = text[5]
            # Body tokens sit between the title and the trailing boilerplate
            mains = text[6:-4]
            main = "".join(token + " " for token in mains)
            source = "火星财经快讯"
            storage(title, alert_time, source, main)
        except Exception as err:
            mistake(url, err)
    else:
        # FIX: the original concatenated str + int (TypeError); stringify the
        # status code before building the message.
        err = "reponse.status_code为:" + str(reponse.status_code)
        mistake(url, err)
def starts():
    """Crawl hangliancj alerts by numeric article id.

    The listing page shows items out of order, so ids are iterated directly.
    """
    n = 8300
    first_pass = True
    # Resume from the newest stored id when the database is non-empty
    number = max_id(come_from="hangliancj_alerts")
    if number:
        n = number + 1
        first_pass = False
    while True:
        try:
            url = "http://hangliancj.com/article/%s.html" % n
            resp = requests.get(url, headers=headers.header())
            resp.encoding = "utf-8"
            if resp.status_code == 200:
                if download(url, resp, n):
                    break
            else:
                mistake(url, resp.status_code)
                break
            # The first run counts down through history, later runs count up
            if first_pass:
                n -= 1
            else:
                n += 1
        except TimeoutError:
            time.sleep(10)
def download(url, html):
    """Parse one fn article page (raw HTML string) and store it."""
    try:
        print("fn")
        # Article id comes from the URL
        pattern_num = re.compile('\d+')
        number = re.findall(pattern_num, url)[0]
        # Skip anything already in the database
        if rechecking(number, come_from="fn"):
            return
        # Publication date: strict YYYY-MM-DD with month/day validation
        pattern_time = re.compile('([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))')
        times = re.findall(pattern_time, html)[0][0:2]
        html = etree.HTML(html)
        # Title
        node = html.xpath('//h1[@class="entry-title"]')[0]
        title = etree.tostring(node, method="text",
                               encoding="utf8").decode("utf8").split()[0]
        # Authors: every token before the bullet in the info line
        node = html.xpath('//div[@class="entry-info"]')[0]
        info = etree.tostring(node, method="text",
                              encoding="utf8").decode("utf8").split()
        authors = info[0:info.index('•')]
        # Teaser / sub-title
        node = html.xpath('//div[@class="entry-excerpt"]')[0]
        subtitle = etree.tostring(node, method="text",
                                  encoding="utf8").decode("utf8").split()[0]
        # Body; the final retained token doubles as the source credit
        node = html.xpath('//div[@class="entry-content clearfix"]')[0]
        mains = etree.tostring(node, method="text",
                               encoding="utf8").decode("utf8").split()
        main = mains[0:-1]
        source = main[-1]
        # Persist
        storage(title, authors, times, source, main, number, subtitle)
    except Exception as err:
        mistake(url, err)
def titleUrl(url, headers):
    """Download one jinse article, then keep following its "next article" link.

    Stops when an already-stored article is reached or the retry budget runs out.
    """
    s = 0
    while True:
        try:
            reponse = requests.get(url, headers=headers)
            reponse.encoding = "utf-8"
            # Article id from the URL, used for dedup
            pattern_num = re.compile(r'\d+')
            number = re.findall(pattern_num, url)[0]
            # Stop once this article is already in the database
            if rechecking(number, come_from="jinse"):
                break
            if reponse.status_code == 200:
                download(url, reponse, number)
                # Find the "next article" block and pull its URL for the
                # next loop iteration
                html = reponse.text
                pattern = re.compile('<ol>下一篇</ol>[\s\S]*?</h2>')
                texts = re.findall(pattern, html)[0]
                pattern = re.compile('https://[\s\S]*?\d+.html')
                url = re.findall(pattern, texts)[0]
            else:
                err = reponse.status_code
                mistake(url, err)
                if s == 3:
                    break
                # FIX: was `s += 3`, which exhausted the whole retry budget
                # after a single failure; count one attempt at a time, matching
                # the except-branch below.
                s += 1
        except Exception:
            # FIX: bare `except:` narrowed to Exception so Ctrl-C still works
            if s == 3:
                break
            s += 1
def starts():
    """Iterate all category feeds on 7234.cn, page by page."""
    categories = [
        "https://www.7234.cn/fetch_articles/news",
        "https://www.7234.cn/fetch_articles/blockchain",
        "https://www.7234.cn/fetch_articles/tech",
        "https://www.7234.cn/fetch_articles/huodong",
        "https://www.7234.cn/fetch_articles/column",
    ]
    for category in categories:
        page = 1
        while True:
            url = category + "?page=%s" % page
            resp = requests.get(url, headers=headers.header())
            resp.encoding = "utf-8"
            if resp.status_code != 200:
                mistake(url, resp.status_code)
                break
            # The endpoint wraps rendered HTML inside a JSON envelope
            payload = json.loads(resp.text)
            if getUrl(etree.HTML(payload["html"])):
                break
            page += 1
def starts():
    """Page through the btc123 flash-news API until download() signals done."""
    n = 1
    s = 0
    while True:
        url = "https://apibtc.btc123.com/v1/index/getFlashPage?pageSize=20&pageNumber=%s" % n
        try:
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            # Did the page load?
            if reponse.status_code == 200:
                data = download(reponse, url)
                if data:
                    break
                n += 1
            else:
                err = reponse.status_code
                mistake(url, err)
                # Three attempts in total on hard failures
                if s == 2:
                    break
                s += 1
        except Exception:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            if s == 2:
                break
            s += 1
def starts():
    """Walk shangxia.net alert ids one at a time."""
    n = 2016
    first_run = True
    # Resume from the newest stored id if the table already has rows
    number = max_id(come_from="shangxia_alerts")
    if number:
        n = number + 1
        first_run = False
    while True:
        try:
            url = "https://www.shangxia.net/kuaixun/1/%s.html" % n
            resp = requests.get(url, headers=headers.header())
            resp.encoding = "utf-8"
            if resp.status_code == 200:
                if download(resp, n, url):
                    break
            else:
                mistake(url, resp.status_code)
                break
            # The first run walks backwards in time, later runs forwards
            n = n - 1 if first_run else n + 1
        except TimeoutError:
            time.sleep(10)
def download(urls):
    """Fetch each hecaijing alert detail page from `urls` (relative paths) and store it."""
    for url_one in urls:
        try:
            print("hecaijing_alerts")
            url = "https://www.hecaijing.com" + url_one
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                # FIX: parse only after the status check — the original built
                # the etree before knowing the request succeeded.
                html = etree.HTML(reponse.text)
                # Numeric id from the URL, used for dedup
                pattern_num = re.compile(r'\d+')
                number = re.findall(pattern_num, url)[0]
                # NOTE(review): `return` (not `continue`) aborts the whole
                # batch at the first already-stored item — presumably the list
                # is newest-first; confirm.
                if rechecking(number, come_from="hecaijing_alerts"):
                    return
                # Date + clock-time, title, body and source
                times = html.xpath('/html/body/div[5]/div[1]/h2/text()')
                time_hour = html.xpath(
                    '/html/body/div[5]/div[1]/div/p[1]/span/text()')[0].split()
                timeout = times[1] + time_hour[0]
                title = html.xpath(
                    '/html/body/div[5]/div[1]/div/p[2]/text()')[0]
                main_texts = html.xpath(
                    '/html/body/div[5]/div[1]/div/div[1]/text()')[0].split()
                source = "核财经:" + url
                storage(number, title, timeout, main_texts, source)
            else:
                err = reponse.status_code
                mistake(url, err)
        except Exception as err:
            mistake(url_one, err)
def download(reponse, number, url):
    """Parse a shangxia.net alert page and store it; returns True on an empty title."""
    try:
        print("shangxia_alerts")
        html = etree.HTML(reponse.text)
        title = html.xpath('//*[@id="title"]/text()')[0]
        # An empty title signals the caller to stop walking ids
        if not title:
            return True
        # Author and publication time live in the same header strip
        author = html.xpath('//div[@class="title_trade2"]/a/text()')[0]
        timeout = html.xpath('//div[@class="title_trade2"]/text()')[0].split()[1:3]
        # Source credit
        source = html.xpath('/html/body/div[11]/div[6]/text()')[1].split()[0]
        # Disclaimer block, flattened to a single space-separated string
        stmt_node = html.xpath('/html/body/div[11]/div[7]')[0]
        stmt_words = etree.tostring(stmt_node, method="text",
                                    encoding="utf8").decode("utf8").split()
        statement = "".join(word + " " for word in stmt_words)
        # Body text
        body_node = html.xpath('//*[@id="content"]')[0]
        text = etree.tostring(body_node, method="text",
                              encoding="utf8").decode("utf8").split()
        # Images may hang directly off the content node or via div/p children
        img = (html.xpath('//*[@id="content"]/img/@src')
               + html.xpath('//*[@id="content"]/div/img/@src')
               + html.xpath('//*[@id="content"]/p/img/@src'))
        storage(number, title, author, timeout, source, text, statement, img)
    except Exception as err:
        mistake(url, err)
def download(html, author, number, url):
    """Parse an epcnn article page and store the record."""
    try:
        print("epcnn")
        # Title and publication time
        title = html.xpath(
            '/html/body/section/div[1]/div/header/h1/a/text()')[0]
        timeout = html.xpath(
            '/html/body/section/div[1]/div/header/div/span[1]/text()')[0]
        # Category and tag list
        classify = html.xpath(
            '/html/body/section/div[1]/div/header/div/span[2]/a/text()')[0]
        label = html.xpath('/html/body/section/div[1]/div/div[5]/a/text()')
        # Site disclaimer for this article
        statement = html.xpath(
            '/html/body/section/div[1]/div/div[3]/text()')[0]
        # Images referenced from the article body
        img = html.xpath('/html/body/section/div[1]/div/article//img/@src')
        # Breadcrumb trail, folded into a single source string
        crumbs = html.xpath('/html/body/div[2]/div/a/text()')
        source = "e能链财经" + "".join("-" + crumb for crumb in crumbs)
        # Full body text
        article = html.xpath('/html/body/section/div[1]/div/article')[0]
        text = etree.tostring(article, method="text",
                              encoding="utf8").decode("utf8")
        storage(number, title, author, timeout, source, text, label,
                classify, statement, img)
    except Exception as err:
        mistake(url, err)
def starts():
    """Follow chainfor news pages starting from a seed id."""
    n = 33588
    # Tells getUrl() whether to follow the previous-article or next-article link
    tf = True
    # Resume after the newest stored article when the table is non-empty
    number = max_id(come_from="chainfor")
    if number:
        n = int(number) + 1
        tf = False
    url = "https://www.chainfor.com/news/show/%s.html" % n
    while True:
        resp = requests.get(url, headers=headers.header())
        resp.encoding = "utf-8"
        if resp.status_code != 200:
            mistake(url, resp.status_code)
            break
        data = getUrl(resp, tf, url)
        if data == "end":
            break
        if data == "continue":
            continue
        # Otherwise getUrl() returned the next URL to visit
        url = data
def starts():
    """Entry point: fetch the bishequ article list and hand it to connent()."""
    url = "http://bishequ.com/article/getArticleList"
    resp = requests.get(url, headers=headers.header())
    resp.encoding = "utf-8"
    if resp.status_code == 200:
        connent(resp)
    else:
        mistake(url, resp.status_code)
def starts():
    """Entry point: fetch the btc798 home page and hand it to getUrl()."""
    url = "http://www.btc798.com/"
    resp = requests.get(url, headers=headers.header())
    resp.encoding = "utf-8"
    if resp.status_code == 200:
        getUrl(resp)
    else:
        mistake(url, resp.status_code)