def starts():
    """Entry point for the zhilianfm flash-news crawler.

    Fetches the site front page, extracts the newest article number, then
    walks the numbers downward until an already-stored article or a
    successful download stops the loop.

    Fix: report non-200 responses via ``mistake`` like the sibling
    crawlers do, instead of failing silently.
    """
    url = "http://www.zhilianfm.com/zlfmCms/"
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code == 200:
        # Newest article number found on the front page.
        number = getUrl(reponse)
        while True:
            # Stop once this number was already downloaded before.
            if rechecking(number, come_from="zhilianfm_alerts"):
                break
            data = download(number)
            if data:
                break
            number -= 1
    else:
        # Consistency fix: log the HTTP failure instead of doing nothing.
        err = reponse.status_code
        mistake(url, err)
def getUrl(reponse, url):
    """Pick the first usable article path from the page and download it."""
    base = url
    html = reponse.text
    # Candidate article paths look like /section/12345.html.
    links = list(set(re.findall(re.compile(r'/[a-z]*?/\d+\.html'), html)))
    num_pattern = re.compile(r'\d+')
    for path in links:
        number = re.findall(num_pattern, path)[0]
        # Stop on an already-stored article or an implausibly small id.
        if rechecking(number, come_from="bibaodao") or int(number) < 1000:
            break
        download(number, base + path)
        break  # only the first acceptable link is handled
def getUrl(html):
    """Scan anchor hrefs for article links and fetch each new one.

    Returns True once a known article id or an "end" marker is reached.
    """
    pattern = re.compile(r'(/[\s\S]*?/)(\d+)(#commentBox)')
    for href in html.xpath('//a/@href'):
        matches = re.findall(pattern, href)
        if not matches:
            continue  # not an article link
        prefix, number, _anchor = matches[0]
        if rechecking(number, come_from="lianshijie7234"):
            return True
        if connect(prefix + number, number) == "end":
            return True
def getUrl(reponse):
    """Collect .html article paths from the page and download new ones."""
    paths = list(set(re.findall(re.compile(r'/[^\s]*\.html'), reponse.text)))
    digit_pattern = re.compile(r'\d+')
    for path in paths:
        url = "http://shilian.com" + path
        # Concatenate every digit run in the URL into one id string.
        number = "".join(re.findall(digit_pattern, url))
        if rechecking(number, come_from="shilian"):
            break
        download(url, number)
def download(reponse):
    """Parse the coingogo alerts JSON feed and store unseen entries."""
    print("coingogo_alerts")
    items = json.loads(reponse.text)["list"]
    for item in items:
        number = item["id"]
        if rechecking(number, come_from="coingogo_alerts"):
            return True
        ct = item["createtime"]
        # Assemble a human-readable timestamp from the structured fields.
        timeout = (f'{ct["year"]}年{ct["mon"]}月{ct["mday"]}日 '
                   f'{ct["hours"]}:{ct["minutes"]} {ct["weekday"]}')
        storage(number, timeout, item)
def download(reponse):
    """Store new entries from the bishequ alerts JSON feed."""
    print("bishequ_alerts")
    news_list = json.loads(reponse.text)["newsList"]
    for entry in news_list:
        number = entry["id"]
        if rechecking(number, come_from="bishequ_alerts"):
            return True
        tree = etree.HTML(entry["content"])
        # Body text lives in <p>; fall back to <p><span> when <p> is empty.
        paragraphs = tree.xpath('//p/text()') or tree.xpath('//p/span/text()')
        # createTime is in milliseconds since the epoch.
        timeout = time.asctime(time.localtime(int(entry["createTime"]) / 1000))
        storage(entry, paragraphs, timeout)
def findNumber(html):
    """Iterate huolian feed ids, fetching the unseen ones.

    Tolerates up to three already-stored ids (presumably the feed is not
    strictly ordered); the fourth known id ends the run.
    """
    payload = json.loads(html)
    seen_count = 0
    for item in payload["data"]:
        number = item["fcNewsId"]
        if rechecking(number, come_from="huolian"):
            if seen_count == 3:
                return True
            seen_count += 1
            continue
        if connent(number):
            return True
def getUrl(reponse):
    """Extract non-live article URLs from the page and download new ones.

    Bug fix: the original scheme pattern used ``[a-zA-z]``, a character
    class that also matches the ASCII punctuation between ``Z`` and ``a``
    (``[ \\ ] ^ _ `` ` ``); corrected to ``[a-zA-Z]``.
    """
    # Absolute URLs ending in .html
    pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
    urls = list(set(re.findall(pattern, reponse.text)))
    for url in urls:
        # Skip live-stream pages.
        if re.search("live", url):
            continue
        # The first digit run in the URL is the article number.
        number = re.findall(r"\d+", url)[0]
        # Stop once we hit an article that is already stored.
        if rechecking(number, come_from="bitrating"):
            break
        download(number, url)
def getUrl(reponse):
    """Process one page of the polo321 news listing.

    Returns True when paging should stop: either the page is empty
    (past the last page) or an already-stored item was reached.
    """
    print("polo321")
    items = json.loads(reponse.text)["data"]["list"]
    if not items:
        return True
    for item in items:
        number = item["id"]
        if rechecking(number, come_from="polo321"):
            return True
        download(number)
def getUrl(reponse):
    """Find /Content/... article paths on the ihuoqiu page and fetch new ones."""
    path_pattern = re.compile(r'/Content/[^\s]*?data=[^\s]*?__2C__2C')
    split_pattern = re.compile(r'(/Content/[^\s]*?data=)([^\s]*?__2C__2C)')
    for path in set(re.findall(path_pattern, reponse.text)):
        if "video" in path:
            continue  # skip video pages
        url = "https://ihuoqiu.com" + path
        # The data=... token doubles as the article's unique id.
        _, number = re.findall(split_pattern, url)[0]
        if rechecking(number, come_from="ihuoqiu"):
            continue
        download(url, number)
def download(reponse, url):
    """Store unseen items from the btc123 alerts JSON feed."""
    try:
        print("btc123_alerts")
        # The response body is JSON; decode it into a dict.
        payload = json.loads(reponse.text)
        for item in payload["data"]:
            number = item["id"]
            if rechecking(number, "btc123_alerts"):
                return True
            # Convert the site's creation text into a precise timestamp.
            release_time = UTCTime(item["createText"])
            storage(item, release_time)
    except Exception as err:
        mistake(url, err)
def download(text):
    """Extract one coinvoice flash-news item from its HTML and store it."""
    try:
        print("coinvoice_alerts")
        tree = etree.HTML(text)
        # The data-time attribute is unique per item and doubles as the id.
        number = tree.xpath('//div[@class="date"]/@data-time')[0]
        if rechecking(number, come_from="coinvoice_alerts"):
            return True
        title = tree.xpath('//div[@class="title"]/text()')[0]
        # The page only shows a relative time; keep both forms.
        raw_time = tree.xpath('//div[@class="date"]/text()')[0]
        timeout = UTCTime(raw_time) + " --- " + raw_time + "前左右"
        body = tree.xpath('//div[@class="summary"]/text()')[0]
        storage(number, title, timeout, body)
    except Exception as err:
        mistake(url="http://www.coinvoice.cn/category/kuaixun", err=err)
def getUrl(news):
    """Fetch each article path in *news* and store the ones not seen yet."""
    num_pattern = re.compile(r'\d+')
    for path in news:
        url = "http://youjiatuanjian.com" + path
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        html = etree.HTML(reponse.text)
        if reponse.status_code != 200:
            mistake(url, reponse.status_code)
            continue
        # The first digit run in the URL is the article id.
        number = re.findall(num_pattern, url)[0]
        if rechecking(number, come_from="youjiatuanjian"):
            return  # reached already-stored articles; stop entirely
        download(html, number, url)
def starts():
    """Entry point for the longkuai crawler.

    Finds the newest article number on the front page and walks the
    numbers downward until a stored or successfully fetched one ends it.
    """
    url = "http://longkuai.com/"
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code != 200:
        mistake(url, reponse.status_code)
        return
    number = getUrl(reponse)
    while True:
        if rechecking(number, come_from="longkuai"):
            break
        if connent(number):
            break
        number -= 1
def connect(number):
    """Follow the bikuai article chain starting at *number*.

    Downloads each page, then follows the next URL extracted from it,
    stopping at an HTTP error or an already-stored article.
    """
    url = "http://www.bikuai.org/news/%s.html" % number
    num_pattern = re.compile(r'\d+')
    while True:
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code != 200:
            mistake(url, reponse.status_code)
            break
        html = etree.HTML(reponse.text)
        download(html, number)
        # Move on to the next article linked from this page.
        url = downURL(html)
        number = int(re.findall(num_pattern, url)[0])
        if rechecking(number, come_from="bikuai"):
            break
def download(html, url):
    """Parse one huoxing24 article page and store it.

    ``html`` is the raw page source as text, ``url`` the page address.
    Date, title, subtitle, source, author and body are extracted with
    regular expressions / XPath before being passed to ``storage``.
    """
    try:
        print("huoxing24")
        # Article number: the second digit run in the URL.
        pattern_num = re.compile('\d+')
        number = re.findall(pattern_num, url)[1]
        # Skip articles that were already stored.
        if rechecking(number, come_from="huoxing24"):
            return
        # Publication date: strict YYYY-MM-DD matcher that validates the
        # month/day combinations (February capped at 28 — leap days would
        # not match; presumably acceptable for this site).
        pattern_time = re.compile('([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))')
        # NOTE(review): this local name shadows the stdlib ``time`` module
        # inside the function; harmless here since the module is not used.
        time = re.findall(pattern_time, html)[0]
        # print(time)
        # Title: second-to-last whitespace token of the <h1 style...> element.
        pattern_title = re.compile('<h1 style[\s\S]*?</h1>')
        titles = re.findall(pattern_title, html)[0]
        title = titles.split()[-2]
        # print(title)
        # Subtitle: first <h2> element with the tags sliced off.
        pattern_subhead = re.compile('<h2>[\s\S]*?</h2>')
        fu_title = re.findall(pattern_subhead, html)[0]
        subhead = fu_title[4: -5]
        # print(fu_title)
        # Source of the article text ("本文来源" = "article source").
        pattern_source = re.compile('本文来源: <span>[\s\S]*?</span>')
        sources = re.findall(pattern_source, html)[0]
        pattern = re.compile('>[\s\S]*?<')
        source = re.findall(pattern, sources)[0][1: -1] + "--" + url
        # print(source)
        # Author: first run of CJK characters in the author paragraph.
        pattern_authors = re.compile('<p class="author">[\s\S]*?</p>')
        authors = re.findall(pattern_authors, html)[0]
        pattern_author = re.compile('[\u4e00-\u9fa5]+')
        author = re.findall(pattern_author, authors)[0]
        # print(author)
        # Body text: the div with an empty class attribute, flattened to
        # a list of whitespace-separated tokens.
        down_page = etree.HTML(html)
        texts = down_page.xpath('//div[@class=""]')[0]
        text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
        # print(text)
        # Persist everything.
        storage(title, author, subhead, time, source, text, number)
    except Exception as err:
        mistake(url, err)
def connect(url, reload):
    """Fetch one zaoping article URL.

    Returns "end" to stop the caller, "continue" to skip ahead, or None
    after a successful download.
    """
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf8"
    if reponse.status_code != 200:
        mistake(url, reponse.status_code)
        return "end"
    number = re.findall(re.compile(r"\d+"), url)[0]
    if rechecking(number, come_from="zaoping"):
        # Items have no strict order: tolerate a few known ids before ending.
        if reload == 3:
            return "end"
        # NOTE(review): this increment is local and never reaches the
        # caller; presumably the caller maintains its own counter — confirm.
        reload += 1
        return "continue"
    print("zaoping")
    download(reponse, number)
def getUrl(html):
    """Visit each article link (paired with its author) and store new ones."""
    urls = html.xpath(
        '/html/body/section/div[1]/div/article/header/h2/a/@href')
    # Authors are listed in the same order as the article links.
    authors = html.xpath(
        '/html/body/section/div[1]/div/article/p[1]/span[1]/text()')
    num_pattern = re.compile(r'\d+')
    for url, author in zip(urls, authors):
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        # The article id is the first digit run in its URL.
        number = re.findall(num_pattern, url)[0]
        if rechecking(number, come_from="epcnn"):
            return True
        if reponse.status_code == 200:
            page = etree.HTML(reponse.text)
            download(page, author, number, url)
        else:
            mistake(url, reponse.status_code)
def download(url, html):
    """Store each flash-news <li> entry on the page until a known id appears."""
    try:
        # Date header shared by every entry on the page.
        day = html.xpath(
            '//*[@id="kuaixun-wrap"]/div/div[1]/div[1]/text()')[0]
        entries = html.xpath('//*[@id="view"]/li')
        digit_pattern = re.compile(r'\d+')
        for idx, entry in enumerate(entries, start=1):
            # The entry's unique id is hidden in its onclick handler.
            onclick = html.xpath(
                '//*[@id="view"]/li[%s]/a/div[1]/div[2]/@onclick' % idx)
            number = re.findall(digit_pattern, onclick[0])[0]
            if rechecking(number, come_from="youjiatuanjian_alerts"):
                return
            downloadOneMessage(entry, day, url, number)
    except Exception as err:
        mistake(url, err)
def download(reponse, url):
    """Store unseen items from the weilaicaijing alerts JSON feed."""
    try:
        print("weilaicaijing_alerts")
        day_block = json.loads(reponse.text)["data"][0]
        day = day_block["time"]
        title_pattern = re.compile(r"【[\s\S]*?】")
        for item in day_block["list"]:
            number = item["id"]
            if rechecking(number, "weilaicaijing_alerts"):
                return True
            # Full timestamp = shared day part + per-item hour part.
            release_time = day + " " + item["hour"]
            # Titles are enclosed in 【...】 at the start of the text.
            title = re.findall(title_pattern, item["text"])[0]
            storage(item, release_time, number, title)
    except Exception as err:
        mistake(url, err)
def connect(url):
    """Follow the daoqm article chain until a stored id, a download stop
    signal, or an HTTP error ends the walk."""
    num_pattern = re.compile(r'\d+')
    while True:
        number = re.findall(num_pattern, url)[0]
        if rechecking(number, come_from="daoqm"):
            break
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code != 200:
            mistake(url, reponse.status_code)
            break
        html = etree.HTML(reponse.text)
        if download(html, number):
            break
        # The next article URL is linked from the current page.
        url = downURL(html)
def connent(number):
    """Walk the haitunbc article chain starting from *number*."""
    url = "http://www.haitunbc.com/page68.html?article_id=%s" % number
    num_pattern = re.compile(r'\d+')
    while True:
        # Second digit run: the article_id value (the first run is "68").
        number = re.findall(num_pattern, url)[1]
        if rechecking(number, come_from="haitunbc"):
            break
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code != 200:
            mistake(url, reponse.status_code)
            break
        download(reponse, number, url)
        nxt = gainBelowUrl(reponse)
        if nxt == "end":
            break
        url = nxt
def connect(number):
    """Fetch btc798 articles one after another, retrying on timeouts."""
    while True:
        try:
            url = "http://www.btc798.com/articles/%s.html" % number
            if rechecking(number, come_from="btc798"):
                break
            reponse = requests.get(url, headers=headers.header())
            reponse.encoding = "utf-8"
        except TimeoutError:
            # Back off briefly and retry the same article.
            time.sleep(10)
            continue
        if reponse.status_code != 200:
            mistake(url, reponse.status_code)
            break
        html = etree.HTML(reponse.text)
        download(html, number)
        # The page itself supplies the next article number.
        number = downURL(html)
def getUrl(html):
    """Extract article snippets (``<div class="desc"> ... </time>``) and
    download each new article together with its publication time.

    Bug fix: the URL pattern's ``[a-zA-z]`` class also matched the ASCII
    punctuation between ``Z`` and ``a`` (``[ \\ ] ^ _ `` ` ``); corrected
    to ``[a-zA-Z]``.
    """
    snippet_pattern = re.compile(r'<div class="desc">[\s\S]*?</time>')
    url_pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
    num_pattern = re.compile(r'\d+')
    time_pattern = re.compile(r'\d+-\d+-\d+ \d+:\d+')
    for snippet in re.findall(snippet_pattern, html):
        url = re.findall(url_pattern, snippet)[0]
        # The first digit run in the URL is the article number.
        number = int(re.findall(num_pattern, url)[0])
        # Stop at the first already-stored article.
        if rechecking(number, come_from="budkr"):
            break
        timeout = re.findall(time_pattern, snippet)[0]
        reponse = requests.get(url, headers=headers.header())
        reponse.encoding = "utf-8"
        if reponse.status_code == 200:
            download(reponse, timeout, number)
        else:
            mistake(url, reponse.status_code)
def starts():
    """Entry point for the leilook crawler: walk article numbers downward
    from the newest one found on the front page."""
    reload = 0
    url = "http://www.leilook.com/"
    reponse = requests.get(url, headers=headers.header())
    reponse.encoding = "utf-8"
    if reponse.status_code != 200:
        mistake(url, reponse.status_code)
        return
    number = getUrl(reponse)
    while True:
        if rechecking(number, come_from="leilook"):
            break
        data = connent(number, reload)
        if data == "pictrue":
            # Picture-only page (signal spelled "pictrue" as-is): skip it.
            number -= 1
            continue
        if data == "over":
            break
        number -= 1
def getUrl(reponse):
    """Fetch every /news/<id> link found on the coingogo listing page."""
    news_paths = set(re.findall(re.compile(r'/news/\d+'), reponse.text))
    num_pattern = re.compile(r'\d+')
    for path in news_paths:
        url = "http://www.coingogo.com" + path
        page = requests.get(url, headers=headers.header())
        page.encoding = "utf-8"
        # The digit run in the URL is the article id.
        number = re.findall(num_pattern, url)[0]
        if rechecking(number, come_from="coingogo"):
            continue
        if page.status_code == 200:
            download(page, number, url)
        else:
            mistake(url, page.status_code)
def download(reponse_branch, url, branch):
    """Parse one hecaijing article page and store it.

    Fix: the progress log printed "hecaijiing" (typo) while the database
    tag is "hecaijing"; the label now matches the ``come_from`` tag.
    """
    try:
        print("hecaijing")
        # The article id is the first digit run in the URL.
        number = re.findall(re.compile(r'\d+'), url)[0]
        # Skip articles already stored.
        if rechecking(number, come_from="hecaijing"):
            return
        html = etree.HTML(reponse_branch.text)
        # Title, author (label + name), publish time, source and body text.
        title = html.xpath('/html/body/div[5]/div[1]/h1/text()')[0]
        author_label = html.xpath(
            '/html/body/div[5]/div[1]/p[2]/span[1]/text()')[0]
        author_name = html.xpath(
            '/html/body/div[5]/div[1]/p[2]/span[2]/text()')[0].split()[0]
        author = author_label + author_name
        times = html.xpath('/html/body/div[5]/div[1]/p[1]/text()')[0].split()
        publish_time = times[1]
        source = ("核财经-%s:" % branch) + url + "--" + times[0]
        main_node = html.xpath('/html/body/div[5]/div[1]/div[3]')[0]
        text = etree.tostring(
            main_node, method="text", encoding="utf8").decode("utf8").split()
        storage(number, title, author, publish_time, source, text)
    except Exception as err:
        mistake(url, err)
def download(html, time, n, text):
    """Store the n-th FN flash-news entry found on the page."""
    try:
        print("fn_alerts")
        # The entry's source link carries its unique id.
        source_url = html.xpath(
            '//*[@id="wrap"]/div/div/div/div[2]/div[%s]/div[1]/h2/a/@href'
            % n)[0]
        number = re.findall(re.compile(r'\d+'), source_url)[0]
        # Skip entries already stored.
        if rechecking(number, come_from="fn_alerts"):
            return
        # Flatten the entry node to a list of whitespace-separated tokens.
        tokens = etree.tostring(
            text, method="text", encoding="utf8").decode("utf8").split()
        n += 1  # NOTE(review): local only — has no effect outside this call
        # Layout: [0]=clock time, [1]=title, [2:-8]=body text.
        title = tokens[1]
        timeout = time + " " + tokens[0]
        mains = tokens[2:-8]
        source = "FN资讯:" + source_url
        storage(number, title, timeout, source, mains)
    except Exception as err:
        mistake(text, err)
def download(reponse):
    """Parse the babifinance article feed and store unseen entries.

    The article body arrives as one text blob; the code heuristically
    splits it into whitespace-separated segments and scans from the ends
    for a source line ("来源"), an author line ("作者"/"编辑"), and a
    trailing statement, falling back to the literal "BABI财经" when a
    field cannot be located. The two outer branches are near-duplicates
    differing only in how the source is chosen.
    """
    try:
        print("babifinance")
        html = reponse.text
        texts = json.loads(html)
        for text in texts:
            number = text["id"]
            # Stop at the first already-stored entry.
            if rechecking(number, come_from="babifinance"):
                return True
            # Split the content into the segments scanned below.
            data = text["content"].split()
            # Does the first segment contain "来源" ("source")?
            pattern = re.compile("来源")
            exist = re.findall(pattern, data[0])
            if exist:
                source = data[0]
                # Scan backwards from the end for the trailing statement.
                reload = -1
                while True:
                    statement = data[reload]
                    if statement != " ":
                        statement = data[reload]
                        # Check the first segment for "作者" ("author").
                        pattern = re.compile("作者")
                        exist = re.findall(pattern, data[0])
                        if exist:
                            # NOTE(review): the sibling branch below uses
                            # exist[0] here instead of data[0] — one of the
                            # two is presumably unintended; confirm.
                            author = data[0]
                            main = data[1: reload]
                            storage(author, source, statement, main, text)
                            break
                        else:
                            # Walk further back looking for an author line.
                            reload -= 1
                            author = data[reload]
                            while True:
                                if author != " ":
                                    # "编辑|作者" = "editor|author".
                                    pattern = re.compile('编辑|作者')
                                    exist = re.findall(pattern, author)
                                    if exist:
                                        # Long matches are body text, not a
                                        # real author credit.
                                        if len(author) < 30:
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                        else:
                                            reload += 1
                                            author = "BABI财经"
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                    else:
                                        # No author found: use the site name.
                                        author = "BABI财经"
                                        reload += 1
                                        main = data[1: reload]
                                        storage(author, source, statement, main, text)
                                        break
                                else:
                                    reload -= 1
                            break
                    else:
                        reload -= 1
            else:
                # No explicit source: credit the site itself.
                source = "BABI财经"
                reload = -1
                while True:
                    statement = data[reload]
                    if statement != " ":
                        statement = data[reload]
                        pattern = re.compile("作者")
                        exist = re.findall(pattern, data[0])
                        if exist:
                            # NOTE(review): exist[0] is just the matched
                            # literal "作者", not the author text — compare
                            # with data[0] in the branch above; confirm.
                            author = exist[0]
                            main = data[1: reload]
                            storage(author, source, statement, main, text)
                            break
                        else:
                            reload -= 1
                            author = data[reload]
                            while True:
                                if author != " ":
                                    pattern = re.compile('编辑|作者')
                                    exist = re.findall(pattern, author)
                                    if exist:
                                        if len(author) < 30:
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                        else:
                                            reload += 1
                                            author = "BABI财经"
                                            main = data[1: reload]
                                            storage(author, source, statement, main, text)
                                            break
                                    else:
                                        author = "BABI财经"
                                        reload += 1
                                        # NOTE(review): slices from index 0
                                        # here, unlike data[1: reload]
                                        # everywhere else — confirm intent.
                                        main = data[: reload]
                                        storage(author, source, statement, main, text)
                                        break
                                else:
                                    reload -= 1
                            break
                    else:
                        reload -= 1
    except Exception as err:
        mistake(url="http://www.babifinance.com/", err=err)