def starts():
    """Crawl the enorth.com.cn news list pages from newest to oldest.

    First reads the site's page-count JS file to learn the highest list-page
    number, then walks the pages backwards until getURL() reports that an
    already-seen item was reached.
    """
    url = "http://news.enorth.com.cn/system/count/0017000/000000000000/count_page_list_0017000000000000000.js"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        # Fix: match any page count instead of the hard-coded literal 8287,
        # which broke as soon as the site added pages (cf. the dbw.cn crawler).
        pattern = re.compile(r'(maxpage = )(\d+)(;)')
        page = int(re.findall(pattern, html)[0][1])
    else:
        return
    n = 0
    while True:
        if n == 0:
            # The newest items live on the channel front page.
            url = "http://news.enorth.com.cn/gd/"
        else:
            url = "http://news.enorth.com.cn/system/more/17000000000000000/0082/17000000000000000_0000%s.shtml" % page
        response = requests.get(url, headers=headers.header())
        response.encoding = "gb2312"
        if response.status_code == 200:
            html = response.text
            # getURL() returns truthy once it hits known content.
            if getURL(html):
                break
            if n > 0:
                # Fix: step one list page at a time; the old `page -= n`
                # subtracted a growing counter and skipped pages.
                page -= 1
            n += 1
        else:
            break
def starts():
    """Walk the dbw.cn Heilongjiang channel list pages, newest first.

    Reads the page-count JS to learn the highest page number, then fetches
    the index page followed by numbered list pages until getURL() returns
    "end".
    """
    count_url = "https://heilongjiang.dbw.cn/system/count/0015037/000000000000/count_page_list_0015037000000000000.js"
    response = requests.get(count_url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code != 200:
        return
    matches = re.findall(re.compile('(maxpage = )(\d+)(;)'), response.text)
    num = int(matches[0][1])
    n = 0
    while True:
        if n == 0:
            url = "https://heilongjiang.dbw.cn/rc/index.shtml"
        else:
            # Pages are numbered downwards from the maximum.
            page = num - n
            url = "https://heilongjiang.dbw.cn/system/count//0015037/000000000000/000/000/c0015037000000000000_000000%s.shtml" % page
        response = requests.get(url, headers=headers.header())
        response.encoding = "gbk"
        if response.status_code != 200:
            break
        if getURL(response.text) == "end":
            break
        n += 1
def starts():
    """Scrape the huoxing24 front page: collect article links and download each."""
    url = "http://www.huoxing24.com/"
    try:
        # Pull the news-list blocks out of the home page.
        reponse = requests.get(url, headers=header())
        reponse.encoding = "utf-8"
        html = reponse.text
        pattern = re.compile('<div class="index-news-list">[\s\S]*?<div class="shadow">')
        texts = re.findall(pattern, html)
        n = 1
        for text in texts:
            n += 1
            # Extract the article URL from each list block.
            # Fix: `[a-zA-z]` also matched the characters [\]^_` between the
            # letter ranges; use an explicit A-Z range.
            pattern = re.compile('[a-zA-Z]+://[^\s]*\.html')
            url = re.findall(pattern, text)[0]
            reponse = requests.get(url, headers=header())
            reponse.encoding = "utf-8"
            # Only parse pages that actually loaded.
            if reponse.status_code == 200:
                html = reponse.text
                download(html, url)
            else:
                err = reponse.status_code
                mistake(url, err)
    except Exception as err:
        mistake(url, err)
def starts():
    """Crawl the chinanews scroll pages (numbered) plus the military channel
    (a single page)."""
    urls = [
        "http://www.chinanews.com/scroll-news/news%s.html",
        "http://www.chinanews.com/mil/news.shtml"
    ]
    n = 1
    for i in urls:
        # Fix: decide by whether the URL is a page template, not by the
        # current value of n — the old `if n == 1` test broke whenever the
        # first loop exited without incrementing n, sending the plain URL
        # through `i % n` (TypeError).
        if "%s" in i:
            while True:
                url = i % n
                reponse = requests.get(url, headers=headers.header())
                reponse.encoding = "gbk"
                if reponse.status_code == 200:
                    html = reponse.text
                    # getURL() returns truthy once known content is reached.
                    msg = getURL(html)
                    if msg:
                        break
                    n += 1
                else:
                    break
        else:
            url = i
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "gbk"
            if reponse.status_code == 200:
                html = reponse.text
                getURL(html)
def download(url, number):
    """Fetch one shilian.com article page and store its fields via storage().

    url    -- absolute article URL
    number -- item id, passed through to storage()
    """
    print("shilian_alerts")
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "gbk"
    if reponse.status_code == 200:
        try:
            html = etree.HTML(reponse.text)
            # Title and the sidebar (author line + timestamp) — position-based
            # xpaths tied to the site's exact markup.
            title = html.xpath('/html/body/div[2]/div[1]/div/h1/text()')[0]
            theSidebar = html.xpath('/html/body/div[2]/div[1]/div/div[1]/text()')
            author = "世链财经--快讯:" + theSidebar[0]
            timeout = theSidebar[1]
            # Category link inside the sidebar.
            classify = html.xpath('/html/body/div[2]/div[1]/div/div[1]/a/text()')
            source = ("世链财经--资讯--%s:" % classify) + url
            # Article body: whitespace-split plain text of the content div.
            texts = html.xpath('/html/body/div[2]/div[1]/div/div[3]')[0]
            text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
            # Site disclaimer block (div[5]).
            statements = html.xpath('/html/body/div[2]/div[1]/div/div[5]')[0]
            statement = etree.tostring(statements, method="text", encoding="utf8").decode("utf8").split()
            # Tag block: heading text, tag word and its relative link.
            label_head = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/text()')[0]
            label_word = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/text()')[0]
            label_url = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/@href')[0]
            label = label_head + label_word + "--http://www.shilian.com" + label_url
            # Persist everything.
            storage(number, title, timeout, author, source, statement, text, label)
        except Exception as err:
            mistake(url, err)
    else:
        err = reponse.status_code
        mistake(url, err)
def starts():
    """Walk the polo321 live feed in steps of 20 ids.

    With an empty database we start from a known id and walk downwards;
    otherwise we resume just past the stored maximum and walk upwards.
    """
    number = max_id(come_from="polo321_alerts")
    if number:
        n = number + 20
        going_down = False
    else:
        n = 8208
        going_down = True
    while True:
        try:
            url = "http://39.108.117.97:8082/lives/getList?Id=%s&flag=down" % n
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                # classify() returns truthy once known content is reached.
                if classify(reponse):
                    break
            else:
                mistake(url, reponse.status_code)
                break
            n = n - 20 if going_down else n + 20
        except TimeoutError:
            # Server timed out: wait and retry the same id.
            time.sleep(10)
def connect(url, number, title):
    """Prefix the scheme onto a protocol-relative URL, fetch the page, and
    hand the HTML to download()."""
    full_url = "http:" + url
    response = requests.get(full_url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code != 200:
        return
    download(response.text, number, title)
def starts():
    """Page through the btc123 flash-news API until download() signals stop.

    Failed loads (bad status or exception) are retried up to three times
    before giving up.
    """
    page = 1
    retries = 0
    while True:
        url = "https://apibtc.btc123.com/v1/index/getFlashPage?pageSize=20&pageNumber=%s" % page
        try:
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                # download() returns truthy once known content is reached.
                if download(reponse, url):
                    break
                page += 1
            else:
                mistake(url, reponse.status_code)
                # Allow up to three failed loads before giving up.
                if retries == 2:
                    break
                retries += 1
        except:
            # Same three-strikes rule when the request itself blows up.
            if retries == 2:
                break
            retries += 1
def starts():
    """Page through huxiu's article-list endpoint, carrying last_dateline
    forward between pages; the first pass scrapes the home page instead."""
    url = "https://www.huxiu.com/v2_action/article_list"
    n = 1
    last_dateline = ""
    while True:
        data = {
            "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
            "page": str(n),
            "last_dateline": str(last_dateline)
        }
        if n == 1:
            # Fix: do not overwrite `url` — the old code assigned the home
            # page to it, so every later iteration re-requested the home
            # page instead of the API endpoint.
            response = requests.get("https://www.huxiu.com", headers=headers.header())
        else:
            # Fix: `headers=header` passed the function object itself; build
            # the header dict like every other request here.
            # NOTE(review): a GET sends `data` in the request body — the
            # endpoint may actually expect `params=` or POST; confirm.
            response = requests.get(url, data=data, headers=headers.header())
        response.encoding = "utf-8"
        if response.status_code == 200:
            html = response.text
            result = getURL(html, n)
            if result == "end":
                break
            # getURL() returns the dateline of the last item for the next page.
            last_dateline = result
            n += 1
        else:
            break
def starts():
    """Iterate the thepaper.cn channels; combination() builds each page URL
    and getURL() returns either "end" or the timestamp for the next page."""
    channels = [
        "https://www.thepaper.cn/channel_25950",
        "https://www.thepaper.cn/channel_25951",
        "https://www.thepaper.cn/channel_36079",
        "https://www.thepaper.cn/channel_25952",
        "https://www.thepaper.cn/channel_25953",
        "https://www.thepaper.cn/gov_publish.jsp"
    ]
    for channel in channels:
        page = 1
        last_time = ""
        while True:
            url = combination(channel, page, last_time)
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code != 200:
                break
            data = getURL(reponse.text, page, channel)
            if data == "end":
                break
            # Carry the last item's timestamp into the next page request.
            last_time = data
            page += 1
def download(url, number):
    """Fetch one hashcaijing article page and store its fields via storage().

    url    -- absolute article URL
    number -- item id, passed through to storage()
    """
    print('hashcaijing')
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    # Only parse pages that actually loaded.
    if reponse.status_code == 200:
        try:
            # Title, publish time, author — position-based xpaths tied to the
            # site's exact markup.
            html = etree.HTML(reponse.text)
            title = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[1]/b/text()')[0]
            if not len(title):
                # Empty title — presumably tells the caller to stop; verify
                # against the calling loop.
                return True
            timeout = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[1]/text()')[0]
            author = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[2]/text()')[0]
            source = "哈希财经" + ":" + url
            # Article body: whitespace-split plain text of the content div.
            texts = html.xpath('//div[@class="contentNews"]')[0]
            text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
            storage(number, title, timeout, author, source, text)
        except Exception as err:
            mistake(url, err)
    else:
        err = reponse.status_code
        mistake(url, err)
def starts():
    """Fetch the mtime news front page and feed the HTML to getURL()."""
    front = "http://news.mtime.com/#nav"
    reply = requests.get(front, headers=headers.header())
    reply.encoding = "utf-8"
    if reply.status_code == 200:
        getURL(reply.text)
def download(number):
    """Fetch zhilianfm flash item `number` and store it.

    Returns True when the page failed to load (non-200), None otherwise.
    """
    url = 'http://www.zhilianfm.com/zlfmCms/kx/%s.jhtml' % number
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        html = etree.HTML(reponse.text)
        classify = html.xpath(
            '/html/body/section/legend/a[2]/text()')[0].split()[0]
        # Only items in the "快讯" (flash news) category are stored.
        if classify != "快讯":
            return
        print("zhilianfm_alerts")
        # Title
        title = html.xpath('/html/body/div[2]/section/h1/text()')[0]
        author_timeout_source = html.xpath(
            '/html/body/div[2]/section/div[1]/text()')[0].split()
        # Author
        author = author_timeout_source[1]
        # Publish time
        timeout = author_timeout_source[0]
        # Source of the item
        source = author_timeout_source[2]
        # Article body: re-join the text fragments with spaces.
        texts = html.xpath(
            '/html/body/div[2]/section/div[2]/text()')[0].split()
        text = ""
        for i in texts:
            text += i + " "
        storage(number, title, author, timeout, source, text, classify)
    else:
        err = reponse.status_code
        mistake(url, err)
        # NOTE(review): indentation reconstructed — True appears to tell the
        # caller to stop once a page no longer loads; confirm against caller.
        return True
def download(url):
    """Fetch one huoxing24 flash-news page, parse it, and store it.

    Skips items whose numeric id is already present in the database.
    """
    print("huoxing24_alerts")
    reponse = requests.get(url, headers=header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        try:
            # The second digit-run in the URL is the item id.
            pattern_num = re.compile('\d+')
            number = re.findall(pattern_num, url)[1]
            # Skip anything already downloaded.
            if rechecking(number, come_from="huoxing_alerts"):
                return
            html = reponse.text
            down = etree.HTML(html)
            texts = down.xpath('/html/body/div[5]/div[1]')[0]
            text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
            # Assemble the display timestamp from the split text fragments.
            time = text[1] + text[0] + "日" + "--" + text[3] + "--" + text[4]
            title = text[5]
            # Fragments 6..-4 are the article body; the tail is boilerplate.
            mains = text[6:-4]
            main = ""
            for i in mains:
                main += i + " "
            source = "火星财经快讯"
            storage(title, time, source, main)
        except Exception as err:
            mistake(url, err)
    else:
        # Fix: status_code is an int — concatenating it onto a str raised
        # TypeError here and masked the real error.
        err = "reponse.status_code为:" + str(reponse.status_code)
        mistake(url, err)
def starts():
    """Walk the taiwan.cn channel list pages (index.htm, index_1.htm, ...)."""
    urls = [
        "http://www.taiwan.cn/taiwan/index.htm",
        "http://www.taiwan.cn/plzhx/index.htm",
        "http://www.taiwan.cn/lilunpindao/index.htm",
        "http://www.taiwan.cn/xwzx/la/index.htm",
        "http://www.taiwan.cn/xwzx/index.htm",
        "http://culture.taiwan.cn/index.htm"
    ]
    for i in urls:
        n = 0
        while True:
            if n == 0:
                url = i
            else:
                # Fix: escape the dot and anchor the suffix — in the old
                # ".htm" pattern the regex dot matched any character and the
                # substitution was not pinned to the end of the URL.
                url = re.sub(r"\.htm$", "_" + str(n) + ".htm", i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "gbk"
            if response.status_code == 200:
                html = response.text
                # getURL() returns truthy once known content is reached.
                if getURL(html):
                    break
                n += 1
            else:
                break
def starts():
    """Walk the qlwb.com.cn section pages; the yule channel is a single page."""
    urls = [
        "http://www.qlwb.com.cn/news/domesticnews/%s.shtml",
        "http://www.qlwb.com.cn/news/overseas/%s.shtml",
        "http://www.qlwb.com.cn/news/SocialNews/%s.shtml",
        "http://www.qlwb.com.cn/news/sports/%s.shtml",
        "http://www.qlwb.com.cn/news/importantnews/%s.shtml",
        "http://www.qlwb.com.cn/news/commentary/%s.shtml",
        "http://yule.qlwb.com.cn/"
    ]
    for i in urls:
        n = 1
        while True:
            # Fix: the old comparison against the yule URL *without* its
            # trailing slash never matched, so the non-template URL fell
            # through to `i % n` and raised TypeError. Key on whether the
            # URL is a page template instead.
            if "%s" in i:
                url = i % n
            else:
                url = i
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                # getURL() returns truthy once known content is reached.
                if getURL(html):
                    break
                n += 1
            else:
                break
def getUrl(reponse):
    """Find the highest news id on the btc123 listing page, then walk ids
    downwards, downloading each article.

    Stops once an id is already in the database (everything older is
    assumed stored); network failures are retried up to three times.
    """
    html = reponse.text
    pattern = re.compile("news/[^\s]*\/\d+")
    urls = re.findall(pattern, html)
    # Fix: compare ids numerically. The old code took max() over lists of
    # digit *strings*, which orders lexicographically (e.g. "999" > "10000").
    max_number = max(int(re.findall("\d+", i)[0]) for i in urls)
    reload = 0
    while True:
        try:
            url = "https://www.btc123.com/news/newsDetails/%s" % max_number
            reponse_news = requests.get(url, headers=headers.header())
            reponse_news.encoding = "utf-8"
            # Already stored → stop walking.
            if rechecking(max_number, come_from="btc123"):
                break
            if reponse_news.status_code == 200:
                download(reponse_news, url, max_number)
                max_number -= 1
            else:
                mistake(url, reponse_news.status_code)
                # Allow up to three failed loads before giving up.
                if reload == 3:
                    break
                reload += 1
        except:
            if reload == 3:
                break
            reload += 1
def starts():
    """Fetch the gscn.com.cn headline page and hand the HTML to getURL()."""
    page_url = "http://gansu.gscn.com.cn/bwyc/index.html"
    reply = requests.get(page_url, headers=headers.header())
    reply.encoding = "utf-8"
    if reply.status_code == 200:
        getURL(reply.text)
def starts():
    """Crawl hangliancj flash articles by id.

    Flash items are not listed in order on the site, so ids are walked
    directly: downwards from a seed on the first run, upwards from the
    stored maximum afterwards.
    """
    number = max_id(come_from="hangliancj_alerts")
    if number:
        n = number + 1
        walk_down = False
    else:
        # Empty database: start from a known id and walk towards older items.
        n = 8300
        walk_down = True
    while True:
        try:
            url = "http://hangliancj.com/article/%s.html" % n
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                # download() returns truthy once known content is reached.
                if download(url, reponse, n):
                    break
            else:
                mistake(url, reponse.status_code)
                break
            n = n - 1 if walk_down else n + 1
        except TimeoutError:
            # Server timed out: wait and retry the same id.
            time.sleep(10)
def starts():
    """Walk the yulefm section list pages (index.html, index_2.html, ...)."""
    urls = [
        "http://www.yulefm.com/star/index.html",
        "http://www.yulefm.com/movie/index.html",
        "http://www.yulefm.com/v/index.html",
        "http://www.yulefm.com/music/index.html",
        "http://www.yulefm.com/shishang/index.html"
    ]
    for i in urls:
        n = 1
        while True:
            if n == 1:
                url = i
            else:
                # Fix: the substitution subject was the int counter `n`
                # instead of the URL string `i`, so every page past the
                # first crashed inside re.sub. Also escape/anchor the
                # suffix pattern.
                url = re.sub(r"\.html$", "_" + str(n) + ".html", i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                # getURL() returns truthy once known content is reached.
                if getURL(html):
                    break
                n += 1
            else:
                break
def starts():
    """Follow the chainfor article chain; getUrl() hands back the next URL.

    Starts from a seed id on the first run, or just past the stored maximum
    afterwards; getUrl() returns "end" to stop, "continue" to retry the
    same URL, or the next URL to fetch.
    """
    start_id = 33588
    first_run = True
    number = max_id(come_from="chainfor")
    if number:
        start_id = int(number) + 1
        first_run = False
    url = "https://www.chainfor.com/news/show/%s.html" % start_id
    while True:
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code != 200:
            mistake(url, reponse.status_code)
            break
        data = getUrl(reponse, first_run, url)
        if data == "end":
            break
        if data == "continue":
            continue
        # Follow the returned next-article URL.
        url = data
def starts():
    """Crawl shangxia.net flash items by id: downwards from a seed on the
    first run, upwards from the stored maximum on later runs."""
    number = max_id(come_from="shangxia_alerts")
    if number:
        n = number + 1
        walk_down = False
    else:
        # Empty database: start from a known id and walk towards older items.
        n = 2016
        walk_down = True
    while True:
        try:
            url = "https://www.shangxia.net/kuaixun/1/%s.html" % n
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                # download() returns truthy once known content is reached.
                if download(reponse, n, url):
                    break
            else:
                mistake(url, reponse.status_code)
                break
            n = n - 1 if walk_down else n + 1
        except TimeoutError:
            # Server timed out: wait and retry the same id.
            time.sleep(10)
def starts():
    """Fetch the kg.com finance page and hand the HTML to getUrl()."""
    page_url = "https://www.kg.com/jinrong"
    reply = requests.get(page_url, headers=headers.header())
    reply.encoding = "utf-8"
    if reply.status_code == 200:
        getUrl(reply.text)
def starts():
    """Page through every artsbj.com list section until getURL() reports
    previously-seen content."""
    templates = [
        "http://www.artsbj.com/list-17-%s.html",
        "http://www.artsbj.com/list-18-%s.html",
        "http://www.artsbj.com/list-19-%s.html",
        "http://www.artsbj.com/list-20-%s.html",
        "http://www.artsbj.com/list-21-%s.html",
        "http://www.artsbj.com/list-22-%s.html",
        "http://www.artsbj.com/list-23-%s.html",
        "http://www.artsbj.com/list-24-%s.html"
    ]
    for template in templates:
        page = 1
        while True:
            url = template % page
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code != 200:
                break
            if getURL(response.text):
                break
            page += 1
def starts():
    """Walk legaldaily.com.cn node list pages (node_X.htm, node_X_2.htm, ...)."""
    sections = [
        "http://www.legaldaily.com.cn/index_article/node_5955.htm",
        "http://www.legaldaily.com.cn/Finance_and_Economics/node_75684.htm",
        "http://www.legaldaily.com.cn/IT/node_69471.htm",
        "http://www.legaldaily.com.cn/society/node_55564.htm",
        "http://www.legaldaily.com.cn/army/node_80560.htm"
    ]
    for base in sections:
        page = 1
        while True:
            if page == 1:
                url = base
            else:
                # Splice the page number in before the .htm suffix.
                url = re.sub(".htm", "_" + str(page) + ".htm", base)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code != 200:
                break
            if getURL(response.text):
                break
            page += 1
def starts():
    """Walk the cqcb.com JSON list feeds (index.json, index_2.json, ...)."""
    urls = [
        "http://www.cqcb.com/highlights/index.json",
        "http://www.cqcb.com/shishi/index.json",
        "http://www.cqcb.com/entertainment/index.json",
        "http://www.cqcb.com/science/index.json"
    ]
    for i in urls:
        n = 1
        while True:
            if n == 1:
                url = i
            else:
                # Fix: escape the dot and anchor the suffix — in the old
                # ".json" pattern the regex dot matched any character and
                # the substitution was not pinned to the end of the URL.
                url = re.sub(r"\.json$", "_" + str(n) + ".json", i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                # getURL() returns truthy once known content is reached.
                if getURL(html):
                    break
                n += 1
            else:
                break
def starts():
    """Walk the huaxia.com news channel list pages (index.html, index_2.html, ...)."""
    channels = [
        "http://www.huaxia.com/xw/dlxw/index.html",
        "http://www.huaxia.com/xw/twxw/index.html",
        "http://www.huaxia.com/xw/gaxw/index.html",
        "http://www.huaxia.com/xw/gjxw/index.html",
        "http://www.huaxia.com/xw/zhxw/index.html"
    ]
    for base in channels:
        page = 1
        while True:
            if page == 1:
                url = base
            else:
                # Splice the page number in before the .html suffix.
                url = re.sub(".html", "_" + str(page) + ".html", base)
            response = requests.get(url, headers=headers.header())
            response.encoding = "gbk"
            if response.status_code != 200:
                break
            if getURL(response.text):
                break
            page += 1
def connect(url, number):
    """Resolve a relative shobserver path, fetch it, and pass the HTML on
    to download()."""
    absolute = "https://www.shobserver.com" + url
    response = requests.get(absolute, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code != 200:
        return
    download(response.text, number)
def starts():
    """Fetch the idol001 front page and feed the HTML to getURL()."""
    home = "https://idol001.com/"
    reply = requests.get(home, headers=headers.header())
    reply.encoding = "utf-8"
    if reply.status_code == 200:
        getURL(reply.text)
def starts():
    """Page through the 7234.cn fetch_articles feeds; each response is JSON
    wrapping an HTML fragment that getUrl() consumes."""
    feeds = [
        "https://www.7234.cn/fetch_articles/news",
        "https://www.7234.cn/fetch_articles/blockchain",
        "https://www.7234.cn/fetch_articles/tech",
        "https://www.7234.cn/fetch_articles/huodong",
        "https://www.7234.cn/fetch_articles/column"
    ]
    for feed in feeds:
        page = 1
        while True:
            url = feed + "?page=%s" % page
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
            if reponse.status_code == 200:
                # The endpoint returns JSON whose "html" field holds the
                # rendered list fragment.
                payload = json.loads(reponse.text)
                fragment = etree.HTML(payload["html"])
                # getUrl() returns truthy once known content is reached.
                if getUrl(fragment):
                    break
                page += 1
            else:
                mistake(url, reponse.status_code)
                break