def crawFinanceHLDataSource(link):
    """Build the single headline record scraped from ``link`` (source CCTVCHINA).

    All fetching and text slicing is delegated to ``TakFinanceHLNetSpiderUtils``;
    this function only sequences those calls and assembles the result.

    Args:
        link: Passed straight to ``TakFinanceHLNetSpiderUtils.returnStartContext``
            (presumably the page URL -- confirm against the helper).

    Returns:
        A one-element list whose only item is the record
        ``[uuid1 string, link URL, image URL, title, publication date,
        description, 'MACRO', 'CCTVCHINA']``. The publication date is the
        current local date formatted as ``%Y-%m-%d``; it is not read from
        the page.
    """
    utils = TakFinanceHLNetSpiderUtils
    section_marker = '<div class="show">'

    # Narrow the fetched content to the span between the "show" and "hot" divs.
    page = utils.returnStartContext(link, section_marker)
    section = utils.filterContextByTarget(page, section_marker, '<div class="hot">')
    # NOTE(review): the markers below are written without spaces ('<ahref="'),
    # presumably because removeSpecialCharacter strips whitespace -- confirm.
    section = utils.removeSpecialCharacter(section)

    link_url = utils.filterContextByTarget(section, '<ahref="', '"target')

    # Each later field is looked up in the text remaining after the previous anchor.
    after_src = utils.filterAfterContext(section, 'src')
    image_url = utils.filterContextByTarget(after_src, 'class="', '"/></a>')

    after_heading = utils.filterAfterContext(after_src, '<h3>')
    headline = utils.filterContextByTarget(after_heading, 'blank">', '</a>')
    summary = utils.removeSpecialCharacter(
        utils.filterContextByTarget(after_heading, '</h3>', '</span>')
    )

    today = time.strftime("%Y-%m-%d", time.localtime())
    record = [
        str(uuid.uuid1()),
        link_url,
        image_url,
        headline,
        today,
        summary,
        'MACRO',
        'CCTVCHINA',
    ]
    return [record]
def crawFinanceHLDataSource(link):
    """Build the single headline record scraped from ``link`` (source TAKCHINA).

    All fetching and text slicing is delegated to ``TakFinanceHLNetSpiderUtils``;
    this function only sequences those calls and assembles the result.

    Args:
        link: Passed straight to ``TakFinanceHLNetSpiderUtils.returnStartContext``
            (presumably the page URL -- confirm against the helper).

    Returns:
        A one-element list whose only item is the record
        ``[uuid1 string, link URL, image URL, title, publication date,
        description, "MACRO", "TAKCHINA"]``. The publication date is the
        current local time formatted as ``%Y-%m-%d %X``; it is not read from
        the page.
    """
    utils = TakFinanceHLNetSpiderUtils
    box_marker = '<div class="p3_box">'

    # Narrow the fetched content to the span between the p3_box and "clear" divs.
    page = utils.returnStartContext(link, box_marker)
    box = utils.filterContextByTarget(page, box_marker, '<div class="clear"></div>')
    # NOTE(review): the markers below are written without spaces ('<imgsrc="'),
    # presumably because removeSpecialCharacter strips whitespace -- confirm.
    box = utils.removeSpecialCharacter(box)

    # Link, title and image are all extracted from the same cleaned box text.
    link_url = utils.filterContextByTarget(box, '<ahref="', '"target')
    headline = utils.filterContextByTarget(box, 'blank">', "</a></div>")
    image_url = utils.filterContextByTarget(box, '<imgsrc="', '"border')

    # The description runs from the start of the summary div up to the next link.
    after_summary = utils.filterAfterContext(box, '<divclass="summary">')
    summary = utils.filterContextByTarget(after_summary, "", "<ahref")

    timestamp = time.strftime("%Y-%m-%d %X", time.localtime())
    record = [
        str(uuid.uuid1()),
        link_url,
        image_url,
        headline,
        timestamp,
        summary,
        "MACRO",
        "TAKCHINA",
    ]
    return [record]
def crawFinanceHLDataSource(link):
    """Build the single headline record scraped from ``link`` (source NBDCHINA).

    All fetching and text slicing is delegated to ``TakFinanceHLNetSpiderUtils``;
    this function only sequences those calls and assembles the result.

    Args:
        link: Passed straight to ``TakFinanceHLNetSpiderUtils.returnStartContext``
            (presumably the page URL -- confirm against the helper).

    Returns:
        A one-element list whose only item is the record
        ``[uuid1 string, link URL, image URL, title, publication date,
        description, 'MACRO', 'NBDCHINA']``. The publication date is the text
        found between ``<span>`` and ``</span>`` with a single space inserted
        after its tenth character.
    """
    utils = TakFinanceHLNetSpiderUtils
    list_marker = '<ul class="articles unorderList unorderList-orange">'

    remainder = utils.returnStartContext(link, list_marker)
    # filterAfterContext is applied twice with the same marker, as before.
    # NOTE(review): presumably this skips to the second occurrence of the
    # list -- confirm against the helper and the page layout.
    for _ in range(2):
        remainder = utils.filterAfterContext(remainder, list_marker)

    # Keep only the first <li ...</li> item, then clean it.
    item = utils.filterContextByTarget(remainder, '<li', '</li>')
    # NOTE(review): the markers below are written without spaces ('<ahref="'),
    # presumably because removeSpecialCharacter strips whitespace -- confirm.
    item = utils.removeSpecialCharacter(item)

    link_url = utils.filterContextByTarget(item, '<ahref="', '"target')

    # Remaining fields are looked up in the text that follows the title attribute.
    after_title_attr = utils.filterAfterContext(item, 'title="')
    headline = utils.filterContextByTarget(after_title_attr, '>', '</a>')

    raw_date = utils.filterContextByTarget(after_title_attr, '<span>', '</span>')
    # Re-insert a separator between the first ten characters and the rest.
    published = raw_date[:10] + ' ' + raw_date[10:]

    image_url = utils.filterContextByTarget(after_title_attr, 'src="', '"width')
    summary = utils.filterContextByTarget(
        after_title_attr, '<pclass="articleMaterial_digest_3row">', '</div>'
    )

    record = [
        str(uuid.uuid1()),
        link_url,
        image_url,
        headline,
        published,
        summary,
        'MACRO',
        'NBDCHINA',
    ]
    return [record]