def GetNewsListEs(url, time=dt.datetime.min, label='', maxOverdue=5):
    """Crawl a paginated Eastmoney news-list page and return its articles.

    Walks every page of the list at *url* (page N is derived as
    '<base>_N.html'), fetches each article, and stops early once
    *maxOverdue* articles older than *time*'s date have been seen.

    Args:
        url: list-page URL ending in '.html'.
        time: cutoff datetime; articles whose date is before time.date()
            count as overdue and are not collected.
        label: tag copied onto each fetched news object.
        maxOverdue: number of overdue articles tolerated before aborting.

    Returns:
        (newsList, maxTime): collected news objects and the newest article
        time seen (dt.datetime.min when nothing was collected).
    """
    soup = News.GetSoup(url, 'lxml')
    # The total page count is the text node just before the first
    # 'page-btn' anchor inside the pager div.
    maxPage = int(
        soup.body.select('div[id="pagerNoDiv"]')[0].select(
            'a[class="page-btn"]')[0].previous_sibling.text)
    newsList = []
    overdueCount = 0
    maxTime = dt.datetime.min
    baseUrl = url[:-5]  # strip the trailing '.html' once, outside the loop
    for p in range(1, maxPage + 1):
        pageUrl = baseUrl + '_' + str(p) + '.html'
        try:
            urlList = GetNewsUrlEs(pageUrl)
        except Exception as e:
            News.WriteLog(str(e) + '. url = ' + pageUrl)
            continue
        for newsUrl in urlList:
            try:
                news = GetNewsEs(newsUrl)
                news.label = label
                if news.time.date() < time.date():
                    overdueCount += 1
                else:
                    newsList.append(news)
                    maxTime = max(maxTime, news.time)
                    print(news.url)
                    print(news.time)
                    print(news.title)
            except Exception as e:
                News.WriteLog(str(e) + ', url = ' + newsUrl)
                continue
        # Enough stale articles on this page: everything after is older.
        if overdueCount >= maxOverdue:
            return newsList, maxTime
    return newsList, maxTime
def GetReportListEs(url, time=dt.datetime.min, label='', maxOverdue=5):
    """Crawl an Eastmoney research-report JSON feed and return its reports.

    First collects report URLs page by page (each URL carries its
    publication timestamp as its last 19 characters), stopping once
    *maxOverdue* entries older than *time*'s date are seen, then fetches
    the collected reports.

    Args:
        url: feed URL template containing the page placeholder '&p=x&'.
        time: cutoff datetime; only reports strictly newer are collected.
        label: tag copied onto each fetched news object.
        maxOverdue: number of overdue entries tolerated before aborting.

    Returns:
        (newsList, maxTime): fetched report objects and the newest report
        time seen (dt.datetime.min when nothing was collected).
    """
    timeFmt = '%Y-%m-%dT%H:%M:%S'
    maxPage = 1000  # hard ceiling; the overdue counter normally stops us first
    newsList = []
    overdueCount = 0
    maxTime = dt.datetime.min
    urlList = []
    exitTraverse = False
    for i in range(1, maxPage + 1):
        jsonUrl = url.replace('&p=x&', '&p=' + str(i) + '&')
        try:
            urlSet = GetReportUrlEs(jsonUrl)
        except Exception as e:
            News.WriteLog(str(e) + '. url = ' + jsonUrl)
            continue
        for u in urlSet:
            # The URL's last 19 chars are the ISO timestamp 'YYYY-MM-DDTHH:MM:SS'.
            newsTime = dt.datetime.strptime(u[-19:], timeFmt)
            if newsTime.date() < time.date():
                overdueCount += 1
                if overdueCount >= maxOverdue:
                    exitTraverse = True
                    break
            if newsTime > time:
                urlList.append(u)
                maxTime = max(maxTime, newsTime)
        if exitTraverse:
            break
    for newsUrl in urlList:
        try:
            news = GetNewsEs(newsUrl)
            news.label = label
            news.time = dt.datetime.strptime(newsUrl[-19:], timeFmt)
            # Drop the 23-char suffix appended when the fetch URL was built
            # (presumably ',<timestamp>' — behavior preserved from original).
            news.url = news.url[:-23]
            newsList.append(news)
            print(news.url)
            print(news.time)
            print(news.title)
        except Exception as e:
            News.WriteLog(str(e) + ', url = ' + newsUrl)
            continue
    return newsList, maxTime
def SetPara(path, paraDict):
    """Persist *paraDict* to *path*, one 'key,value' pair per line.

    Values must already be strings. Failures are logged via News.WriteLog
    rather than raised (best-effort, matching the crawler's style).

    Fixes over the original:
    - 'with' replaces try/finally; the old code crashed with NameError in
      'finally' when open() itself failed (file was never bound).
    - Writes '\\n' instead of os.linesep: text mode already translates
      '\\n' to the platform separator, so os.linesep produced '\\r\\r\\n'
      on Windows.
    - The failure log now reports the actual *path*, not a hardcoded one.
    """
    try:
        with open(path, 'w') as file:
            for k, v in paraDict.items():
                file.write(k + ',' + v + '\n')
    except Exception:
        News.WriteLog('Fail to open ' + path)
def GetPara(path):
    """Read 'key,value' lines from *path* into a dict.

    Malformed lines (no comma, blank) are skipped silently; a failure to
    open the file is logged and an empty dict is returned.

    Fixes over the original:
    - 'with' replaces try/finally; the old code crashed with NameError in
      'finally' when open() itself failed (file was never bound).
    - Newline stripping used `line[-1] == os.linesep`, which never matched
      on Windows (os.linesep is two chars); rstrip('\\n') is correct in
      text mode on every platform.
    - The failure log now reports the actual *path*, not a hardcoded one.

    Returns:
        dict mapping the first comma-separated field to the second.
    """
    paraDict = {}
    try:
        with open(path) as file:
            for line in file:
                parts = line.rstrip('\n').split(',')
                try:
                    paraDict[parts[0]] = parts[1]
                except IndexError:
                    continue  # line had no comma — skip, as before
    except Exception:
        News.WriteLog('Fail to open ' + path)
    return paraDict
def Launch(db):
    """Crawl every configured Eastmoney channel and persist results to *db*.

    Workflow:
      1. Load per-channel last-update times from the 'CONFIG' collection.
      2. Run each channel's crawler (HTML news lists or report JSON feeds).
      3. Store one 'news' document per article plus one 'section' document
         per article section (simhash dedup via the rm module, word
         segmentation via THULAC).
      4. Write the refreshed update times back to 'CONFIG'.

    Fixes over the original: removed the redundant `if True:` wrapper,
    removed the dead trailing `ddd = 0`, deduplicated the time-format
    literal, and factored the 38 identical news-channel dict literals into
    a data table (resulting infoList entries are identical, same order).

    Args:
        db: MongoDB database handle exposing 'CONFIG', 'news' and
            'section' collections.
    """
    timeFmt = '%Y-%m-%d %H:%M:%S'
    parser = thulac.thulac(user_dict=os.path.join('.', 'dict', 'dict'),
                           filt=False,
                           seg_only=True)
    timeDict = db['CONFIG'].find_one({'_id': 'CONFIG_ES'})
    # Stored times are strings; parse every field except the Mongo _id.
    for key in timeDict.keys():
        if key != '_id':
            timeDict[key] = dt.datetime.strptime(timeDict[key], timeFmt)
    rem = rm.GetDupRem(db, 7, False)

    # Research-report JSON feeds — crawled by GetReportListEs.
    infoList = [{
        'url': 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?'
               'type=SR&sty=GGSR&'
               'js={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}'
               '&ps=50&p=x&mkt=0&stat=0&cmd=2&code=&rt=50048771',
        'key': 'ggyb',
        'label': '东方财富 个股研报',
        'method': GetReportListEs
    }, {
        'url': 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?'
               'type=SR&sty=HYSR&'
               'js={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}'
               '&ps=50&p=x&mkt=0&stat=0&cmd=2&code=&rt=50048771',
        'key': 'hyyb',
        'label': '东方财富 行业研报',
        'method': GetReportListEs
    }]
    # Plain HTML news channels — all crawled by GetNewsListEs.
    newsChannels = [
        ('http://stock.eastmoney.com/news/cbkjj.html', 'bkjj', '板块聚焦'),
        ('http://stock.eastmoney.com/news/cggdj.html', 'ggdj', '个股点睛'),
        ('http://finance.eastmoney.com/news/cgsxw.html', 'gsxw', '公司新闻'),
        ('http://finance.eastmoney.com/news/cgspl.html', 'gspl', '股市评论'),
        ('http://finance.eastmoney.com/news/ccjxw.html', 'cjxw', '产经新闻'),
        ('http://finance.eastmoney.com/news/czqyw.html', 'zqyw', '证券要闻'),
        ('http://finance.eastmoney.com/news/cgnjj.html', 'gnjj', '国内经济'),
        ('http://stock.eastmoney.com/news/cbktt.html', 'bktt', '报刊头条'),
        ('http://finance.eastmoney.com/news/cssgs.html', 'ssgs', '上市公司'),
        ('http://finance.eastmoney.com/news/ccjdd.html', 'cjdd', '财经导读'),
        ('http://finance.eastmoney.com/news/cpljh.html', 'pljh', '评论精华'),
        ('http://finance.eastmoney.com/news/ccyts.html', 'cyts', '产业透视'),
        ('http://finance.eastmoney.com/news/cjjsp.html', 'jjsp', '经济时评'),
        ('http://finance.eastmoney.com/news/csygc.html', 'sygc', '商业观察'),
        ('http://finance.eastmoney.com/news/chgyj.html', 'hgyj', '宏观研究'),
        ('http://finance.eastmoney.com/news/cywjh.html', 'ywjh', '要闻精华'),
        ('http://finance.eastmoney.com/news/cgjjj.html', 'gjjj', '国际经济'),
        ('http://biz.eastmoney.com/news/csyzx.html', 'syzx', '商业资讯'),
        ('http://finance.eastmoney.com/news/cjjxr.html', 'jjxr', '经济学人'),
        ('http://stock.eastmoney.com/news/czggng.html', 'zggng', '中国概念股'),
        ('http://hk.eastmoney.com/news/cggyw.html', 'ggyw', '港股要闻'),
        ('http://hk.eastmoney.com/news/cggdd.html', 'ggdd', '港股导读'),
        ('http://hk.eastmoney.com/news/csckx.html', 'sckx', '市场快讯'),
        ('http://hk.eastmoney.com/news/cgsbd.html', 'gsbd', '公司报道'),
        ('http://hk.eastmoney.com/news/cahgdt.html', 'ahgdt', 'AH股动态'),
        ('http://hk.eastmoney.com/news/cggyj.html', 'ggyj', '个股研究'),
        ('http://stock.eastmoney.com/news/cggjh.html', 'ggjh', '个股精华'),
        ('http://stock.eastmoney.com/news/cdpfx.html', 'dpfx', '大盘分析'),
        ('http://stock.eastmoney.com/news/czldd.html', 'zldd', '主力导读'),
        ('http://stock.eastmoney.com/news/czljh.html', 'zljh', '主力精华'),
        ('http://stock.eastmoney.com/news/cgmjj.html', 'gmjj', '公募基金'),
        ('http://stock.eastmoney.com/news/cyzsm.html', 'yzsm', '游资私募'),
        ('http://stock.eastmoney.com/news/csbjj.html', 'sbjj', '社保基金'),
        ('http://stock.eastmoney.com/news/cbxzj.html', 'bxzj', '保险资金'),
        ('http://stock.eastmoney.com/news/czlls.html', 'zlls', '主力论市'),
        ('http://stock.eastmoney.com/news/czlcc.html', 'zlcc', '主力持仓'),
        ('http://stock.eastmoney.com/news/cqfii.html', 'qfii', 'QFII'),
        ('http://stock.eastmoney.com/news/cqsxt.html', 'qsxt', '券商信托'),
    ]
    infoList += [{
        'url': channelUrl,
        'key': channelKey,
        'label': '东方财富 ' + channelName,
        'method': GetNewsListEs
    } for channelUrl, channelKey, channelName in newsChannels]

    for info in infoList:
        key = info['key']
        if key not in timeDict:
            News.WriteLog('Can not get last update time of ' + key +
                          ', set as default value 2016-01-01 00:00:00.')
            timeDict[key] = dt.datetime.strptime('2016-01-01 00:00:00',
                                                 timeFmt)
        newsList, maxTime = info['method'](info['url'], timeDict[key],
                                           info['label'])
        for news in newsList:
            newsDict = {
                '_id': news.url,
                'url': news.url,
                'time': news.time,
                'title': news.title,
                'source': news.source,
                'label': news.label,
                'abstract': news.abstract,
                'secNum': len(news.sectionList)
            }
            db['news'].save(newsDict)
            for section in news.sectionList:
                secKey = news.url + ',' + str(section.seq)
                if db['section'].find_one({'_id': secKey}) is not None:
                    continue  # section already stored
                simhash = rm.SimHash(rm.SenVec(section.content, 2), 64)
                masterId = rem.AddDoc(rm.DocInfo(secKey, simhash, news.time))
                # parser.cut yields (word, tag) pairs; keep only the words.
                parse = [c[0] for c in parser.cut(section.content)]
                secDict = {
                    '_id': secKey,
                    'url': news.url,
                    'time': news.time,
                    'title': news.title,
                    'secTitle': section.title,
                    'content': section.content,
                    'simhash': simhash,
                    'parse': parse,
                    'masterId': '' if masterId is None else masterId
                }
                db['section'].save(secDict)
        # Only advance the watermark when the crawler actually saw news.
        if maxTime != dt.datetime.min:
            timeDict[key] = maxTime
    # Serialize times back to strings before saving the config document.
    for key in timeDict.keys():
        if key != '_id':
            timeDict[key] = timeDict[key].strftime(timeFmt)
    db['CONFIG'].save(timeDict)