# Shared module-level imports these methods rely on:
import json
import re
import time
import urllib.request

import utils
from bs4 import BeautifulSoup


def getDatas(self, timestampes):
    try:
        url = 'http://www.bishijie.com/api/news/?size=100&timestamp=' + str(timestampes)
        response_result = urllib.request.urlopen(url).read()
        tmp = json.loads(response_result)
    except Exception as e:
        return None
    try:
        all_div = tmp['data'][utils.gettoday()]['buttom']
    except KeyError:
        return None
    if not all_div:
        return None
    for item in all_div:
        newsId = item['newsflash_id']
        newsTime = bishijie.getTimeFromStampe(item['issue_time'])
        if newsId in self.ids:
            continue
        # Stop paging as soon as we reach items from a previous day.
        if newsTime.split(" ")[0] != utils.gettoday():
            return None
        content = self.processContent(item)
        newsurl = item['link']
        source = "币世界"
        insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\"".format(
            newsId, newsTime, content, newsurl, source)
        self.dao.saveInfo(tableName=utils.kuaixun_tbl,
                          columesName=utils.kuaixun_columes,
                          values=insertStr)
    # Hand back the last issue_time so update() can request the next page.
    return item['issue_time']
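# bishijie.getTimeFromStampe is called above but not defined in this section.
# A minimal sketch of what it likely does, assuming issue_time is a Unix epoch
# in seconds and utils.gettoday() returns "YYYY-MM-DD" (both assumptions,
# inferred from the date comparison above):
@staticmethod
def getTimeFromStampe(stamp):
    # Format the epoch as "YYYY-MM-DD HH:MM:SS" so newsTime.split(" ")[0]
    # yields the date portion compared against utils.gettoday().
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(stamp)))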
def update(self):
    self.ids = self.dao.getIdsBySource(utils.kuaixun_tbl, utils.gettoday(), "金色财经")
    indexId = self.getDatas(0)
    while indexId is not None:
        indexId = self.getDatas(indexId)
        time.sleep(5)
def flush(self, index):
    cnblogs = self.requestCnblogs(index)
    if cnblogs is None:
        return None
    soup = BeautifulSoup(cnblogs, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'list-art clear'})
    for item in all_div:
        content = self.processContent(item)
        it = content.split(";")
        if it[0] in self.titles:
            continue
        newstime = re.sub("/", "-", it[2])
        if newstime.split(" ")[0] != utils.gettoday():
            return None
        print(content)
        print('http://www.qukuaiwang.com.cn' + item.a['href'])
        title = it[0]
        author = it[1]
        hots = it[3]
        img = self.baseUrl + item.img['src']
        newsuri = 'http://www.qukuaiwang.com.cn' + item.a['href']
        insertStr = "\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",{}".format(
            title, author, newstime, newsuri, img, hots)
        self.dao.saveInfo(utils.qukuaiwang_tbl, utils.qukuaiwang_columes, insertStr)
    time.sleep(5)
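# processContent is not shown for this crawler. Judging from the inline
# version in the older flush() further below (item.text, strip, newlines to
# semicolons), it most likely looks like this sketch (an assumption, not the
# confirmed implementation):
def processContent(self, item):
    # Flatten the listing block's text into "title;author;time;hots" so the
    # caller can split on ";".
    return re.sub("\n+", ";", item.text.strip())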
def update(self):
    self.ids = self.dao.getIdsBySource(utils.kuaixun_tbl, utils.gettoday(), "币世界")
    timest = self.getDatas("")
    while timest is not None:
        timest = self.getDatas(timest)
        time.sleep(5)
def getDatas(self, index):
    url = 'http://www.jinse.com/ajax/weibo/getList?flag=down&id=' + str(index)
    try:
        response_result = urllib.request.urlopen(url).read()
        tmp = json.loads(response_result)
    except Exception as e:
        return None
    try:
        all_div = tmp['data']
    except KeyError:
        return None
    # Guard against an empty page, which would leave item unbound below.
    if not all_div:
        return None
    for item in all_div:
        infoId = item['id']
        if infoId in self.ids:
            continue
        infoDatetime = item['created_at']
        name = item['source_uri']
        if infoDatetime.split(" ")[0] != utils.gettoday():
            return None
        content = self.processContent(item)
        source = "weibo"
        insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\"".format(
            infoId, name, infoDatetime, content.strip('\n'), source)
        self.dao.saveInfo(tableName=utils.weibo_tbl,
                          columesName=utils.weibo_columes,
                          values=insertStr)
    return int(item['id'])
def getDatas(self, index):
    url = 'http://www.jinse.com/ajax/twitters/getList?flag=down&id=' + str(index)
    try:
        response_result = urllib.request.urlopen(url).read()
        tmp = json.loads(response_result)
    except Exception as e:
        return None
    # Loop over the returned items and extract the details.
    try:
        all_div = tmp['data']
    except KeyError:
        return None
    if not all_div:
        return None
    for item in all_div:
        infoId = item['id']
        if infoId in self.ids:
            continue
        name = item['source_uri']
        infoDatetime = item['published_at']
        if infoDatetime.split(" ")[0] != utils.gettoday():
            return None
        content = self.processContent(item)
        # Skip items whose content is missing or whitespace-only.
        if content is None or len(content.strip()) == 0:
            continue
        source = "twitter"
        insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\"".format(
            infoId, name, infoDatetime, content, source)
        print(insertStr)
        self.dao.saveInfo(tableName=utils.twitter_tbl,
                          columesName=utils.twitter_columes,
                          values=insertStr)
    return int(item['id'])
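# processContent for the jinse weibo/twitter items is also not shown. The
# lives crawler below does the same cleanup inline (strip HTML tags, escape
# double quotes, drop newlines), so a plausible sketch is (assumption, using
# a hypothetical 'content' field on the item):
def processContent(self, item):
    content = item.get('content', '') or ''
    content = re.sub(r'<[^<]+>', "", content)   # strip HTML tags
    content = re.sub(r'\"', "\\\"", content)    # escape quotes for the SQL string
    return re.sub('\n', "", content)            # drop newlines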
def flush(self, index):
    cnblogs = self.requestCnblogs(index)
    if cnblogs is None:
        return None
    soup = BeautifulSoup(cnblogs, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'list-art clear'})
    for item in all_div:
        # Collapse the article block's text into one "title;author;time" string.
        content = re.sub("\n+", ";", item.text.strip())
        it = content.split(";")
        if it[0] in self.titles:
            continue
        newstime = re.sub("/", "-", it[2])
        if newstime.split(" ")[0] != utils.gettoday():
            return None
        print(content)
        print('http://www.qukuaiwang.com.cn' + item.a['href'])
        title = it[0]
        author = it[1]
        newsuri = 'http://www.qukuaiwang.com.cn' + item.a['href']
        insertStr = "\"{}\",\"{}\",\"{}\",\"{}\"".format(title, author, newstime, newsuri)
        self.dao.saveInfo("tbl_qukuaiwang", "title,author,newstime,newsuri", insertStr)
    time.sleep(5)
def getDatas(self, index):
    try:
        url = 'http://www.jinse.com/ajax/lives/getList?search=&id=' + str(index) + '&flag=down'
        response_result = urllib.request.urlopen(url).read()
        tmp = json.loads(response_result)
    except Exception as e:
        print("error happen")
        return None
    try:
        all_div = tmp['data'][utils.gettoday()]
    except KeyError:
        return None
    if not all_div:
        return None
    for item in all_div:
        # The API labels today's bucket '今天' ("today"); stop at anything older.
        if item['day_name'] == '今天':
            infoId = item['id']
            sourceurl = item['source_url']
            infoDatetime = item['publish_time']
            # Fall back to today's date plus the created_at clock time when
            # publish_time is missing or zeroed out.
            if infoDatetime is None or infoDatetime == "0000-00-00 00:00:00":
                infoDatetime = utils.gettoday() + " " + item['created_at'] + ":00"
            if infoDatetime.split(" ")[0] != utils.gettoday():
                print(infoDatetime)
                infoDatetime = utils.gettoday() + " " + item['created_at'] + ":00"
            if infoId in self.ids:
                continue
            # Strip HTML tags, escape double quotes for the SQL string, drop newlines.
            content = item['content']
            content = re.sub(r'<[^<]+>', "", content)
            content = re.sub(r'\"', "\\\"", content)
            content = re.sub('\n', "", content)
            print(content)
            insertStr = "{},\"{}\",\"{}\",\"{}\",\"金色财经\"".format(
                infoId, infoDatetime, content, sourceurl)
            print(insertStr)
            self.dao.saveInfo(tableName=utils.kuaixun_tbl,
                              columesName=utils.kuaixun_columes,
                              values=insertStr)
        else:
            return None
    return item['id']
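# utils.gettoday() is compared against the date part of "YYYY-MM-DD HH:MM:SS"
# strings throughout, and is also used as a key into the API's per-day
# buckets, so it presumably returns today's date in that format. A minimal
# sketch (an assumption, not the project's actual utils module):
def gettoday():
    return time.strftime("%Y-%m-%d")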
def getTitles(self):
    return self.dao.getData(utils.qukuaiwang_tbl, "title", utils.gettoday())
def getDatas(self, index):
    url = 'http://www.jinse.com/ajax/weibo/getList?flag=down&id=' + str(index)
    try:
        # Send a browser-like User-Agent; the bare urllib default gets blocked.
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Connection': 'keep-alive'
        }
        req = urllib.request.Request(url, headers=headers)
        response_result = urllib.request.urlopen(req).read()
        tmp = json.loads(response_result)
    except Exception as e:
        return None
    try:
        all_div = tmp['data']
    except KeyError:
        return None
    if not all_div:
        return None
    for item in all_div:
        infoId = item['id']
        # Collect the post's own images plus any retweeted images into one
        # comma-separated URL list.
        contentImgs = ""
        for img in item['image_urls']:
            contentImgs += img['url'] + ","
        for img in item['retweeted_image_urls']:
            contentImgs += img['url'] + ","
        headImg = item['user']['avatar']
        if infoId in self.ids:
            continue
        infoDatetime = item['created_at']
        name = item['source_uri']
        if infoDatetime.split(" ")[0] != utils.gettoday():
            return None
        content = self.processContent(item)
        if item['retweeted_content']:
            content = content + "," + item['retweeted_content']
        if len(content.strip()) == 0:
            continue
        source = "weibo"
        insertStr = "{},\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\"".format(
            infoId, name, infoDatetime, content.strip('\n'), source, headImg, contentImgs)
        self.dao.saveInfo(tableName=utils.weibo_tbl,
                          columesName=utils.weibo_columes,
                          values=insertStr)
    return int(item['id'])
def update(self): self.ids = self.dao.getIds("tbl_weibo", utils.gettoday()) indexId = self.getDatas(0) while indexId != None: indexId = self.getDatas(indexId) time.sleep(5)
def getTitles(self): return self.dao.getData("tbl_qukuaiwang","title",utils.gettoday())