# NOTE(review): this chunk starts in the middle of a function whose header is
# outside the view; the line structure and indentation below are reconstructed
# from a collapsed single-line source -- confirm against the original file.
    # Calling the first '.dl-extra' element with 'dl' -- presumably
    # BeautifulSoup, where calling a tag is shorthand for find_all; confirm.
    dlLists = div.select('.dl-extra')[0]('dl')
    commentExtra = {}
    for dl in dlLists:
        # Key: the <dt> text with every ':', space and '#' character removed.
        title = re.sub(unicode(r'[: #]'),'',dl.dt.text)
        # Value: the <dd> text with all whitespace and '#' characters removed.
        commentExtra[title] = re.sub(r'[\s\r\n#]','',dl.dd.text)
    commentRecord['commentExtra'] = commentExtra
    # Parse the user's ID and display name
    u_name = div.select('.u-name')[0].a
    commentRecord['commentUsername'] = u_name.text.strip()
    # The user id is whatever sits between 'com/' and the trailing 'html' in
    # the link's href.  NOTE(review): the '.' before 'html' is unescaped (it
    # matches any character) and '(.*)' is greedy.
    commentRecord['commentUserid'] = re.findall(r'com/(.*).html',u_name['href'])[0]
    return commentRecord

# Captures everything from the opening of <div id="comments-list" up to (but
# not including) the next <div class="clearfix"; re.S lets '.' span newlines.
ruleDiv = re.compile(r'(<div id=\"comments-list\".*?)<div class=\"clearfix\"',re.S)

if __name__ == '__main__':
    logger = setLog('INFO')
    logger.debug('log level, %d' %(logger.level))
    # URL template with two '%s' slots; what fills them is not visible in
    # this chunk.
    URL='http://club.jd.com/review/%s-0-%s-0.html'
    session = requests.Session()
    commentFilename ='../comments.txt'
    myCommentRecord = CommentRecord(commentFilename)
    #COMMFILE = open(commentFilename, 'w')
    # openTable is defined elsewhere -- presumably returns a MongoDB
    # collection, given the find()/$regex query below; confirm.
    tblProductList = openTable(dbName='shouji',tableName='productList')
    # Only products whose u'操作系统' (operating system) field matches the
    # regex 'Android' are processed.
    for product in tblProductList.find({u'操作系统':{'$regex':'Android'}}):
        # NOTE(review): the rest of this try block (and its handler) lies
        # beyond the end of the visible chunk.
        try:
            skuid= product['sku']
# NOTE(review): this chunk starts in the middle of a function (presumably
# parseCommandLine, which the driver below calls); the nesting of the next
# five lines is reconstructed from a collapsed single-line source -- confirm
# against the original file.
            para[strKey] = True
        # A help flag prints the usage text and exits.
        if opt in ["-h", "--help"]:
            usage()
            sys.exit()
    return para


def usage():
    """Print the command-line synopsis for getCategory.py to stdout."""
    print "Usage: python getCategory.py [--help] [--site] [--hasPrice] [--hasSpec] [--homeUrl] [--level1] [--level2] [--level3]\n"


if __name__ == "__main__":
    retPara = parseCommandLine()
    logger = setLog("INFO")
    logger.debug("log level, %d" % (logger.level))
    session = requests.Session()
    # Each option falls back to the default given here when it is absent from
    # the parsed command line.
    targetSite = retPara.get("site", u"京东")
    targetUrl = retPara.get("homeUrl", "http://www.jd.com/allSort.aspx")
    level1 = retPara.get("level1", None)
    level2 = retPara.get("level2", None)
    level3 = retPara.get("level3", None)
    spec = retPara.get("hasSpec", False)
    price = retPara.get("hasPrice", False)
    # getCategoryUrl and getPidList4Cat are defined elsewhere in the file;
    # their behaviour is not visible in this chunk.
    getCategoryUrl(site=targetSite, url=targetUrl)
    # getPidList4Cat(hasSpec=False,hasPrice=True,site=targetSite,level1Filter=[u'数码'])
    # NOTE(review): the call below is cut off by the chunk boundary; its
    # closing parenthesis lies outside the visible source.
    getPidList4Cat(
        hasSpec=spec,
        hasPrice=price,
        site=targetSite,
        level1Filter=level1,
        level2Filter=level2,
        level3Filter=level3