def crawDailyStockComments(link, webNet):
    """Build one record per ``<li>`` entry of the news list found at *link*.

    The listing is narrowed to the ``<ul class="ul-news-list">`` section and
    then consumed one ``<li>...</li>`` item at a time.  For every item the
    article body is fetched through ``crawDailyDescriptContext``.

    Args:
        link: Location of the listing page, passed to
            ``HGStockNetSpiderUtils.returnStartContext``.
        webNet: Prefix prepended to each extracted ``href`` value to form the
            article URL.

    Returns:
        A list of records, each a list of the form
        ``[uuid, article_url, title, pub_date, description, 'STOCK', 'HGNET']``.
    """
    records = []

    # NOTE(review): presumably fetches the page and returns the markup from
    # the given marker onwards -- confirm against HGStockNetSpiderUtils.
    listing = HGStockNetSpiderUtils.returnStartContext(
        link, '<ul class="ul-news-list">')
    listing = HGStockNetSpiderUtils.filterContextByTarget(
        listing, '<ul', '</ul>')

    # Named ``item_count`` rather than ``len`` so the builtin is not shadowed.
    item_count = HGStockNetSpiderUtils.findAllTarget(listing, '<li>')

    # The year prefix does not depend on the item, so compute it once; this
    # also guarantees every record of a single crawl carries the same year.
    year_prefix = time.strftime('%Y', time.localtime(time.time())) + '-'

    for _ in range(item_count):
        split = HGStockNetSpiderUtils.divisionTarget(listing, '<li>', '</li>')
        # Continue with whatever is left after the item just split off.
        listing = split['nextContext']
        item = split['targetContext']

        linkUrl = webNet + HGStockNetSpiderUtils.filterContextByTarget(
            item, '<a href="', '">')
        title = HGStockNetSpiderUtils.filterContextByTarget(
            item, '">', '</a>')
        # The bracketed text of the item is appended to the current year;
        # presumably it is a month-day string -- TODO confirm.
        pubDate = year_prefix + HGStockNetSpiderUtils.filterContextByTarget(
            item, '[', ']')
        description = crawDailyDescriptContext(linkUrl)

        records.append([
            str(uuid.uuid1()),
            linkUrl,
            title,
            pubDate,
            description,
            'STOCK',
            'HGNET',
        ])

    return records
def crawDailyDescriptContext(linkUrl):
    """Return the cleaned description text for the article at *linkUrl*.

    The page is narrowed to the ``div-article-content`` section, the
    ``<p>``/``</p>`` delimited content is taken from it, and the result is
    passed through ``HGStockNetSpiderUtils.removeSpecialCharacter``.

    Args:
        linkUrl: Location of the article page, passed to
            ``HGStockNetSpiderUtils.returnStartContext``.

    Returns:
        Whatever ``removeSpecialCharacter`` yields for the extracted content
        (presumably a plain string -- verify against the helper).
    """
    article_marker = '<div class="article_con" id="div-article-content">'
    article_section = HGStockNetSpiderUtils.returnStartContext(
        linkUrl, article_marker)
    paragraph = HGStockNetSpiderUtils.filterContextByTarget(
        article_section, '<p>', '</p>')
    return HGStockNetSpiderUtils.removeSpecialCharacter(paragraph)