def getPidList4Cat(hasSpec=False, hasPrice=True, site='', level1Filter=None, level2Filter=None, level3Filter=None):
    """Run getProduct() for every category whose name matches a filter.

    A category document is selected when it has a non-empty 'catUrl' AND
    at least one of the filter lists contains its corresponding level
    name.  With all three filters left as None nothing is processed.

    hasSpec/hasPrice are forwarded to getProduct(); `site` is currently
    unused but kept for interface compatibility.
    """
    catDb = openTable(tableName='catdb')
    for cat in catDb.find():
        if cat['catUrl'] and ((level1Filter and cat['level1'] in level1Filter)
                              or (level2Filter and cat['level2'] in level2Filter)
                              or (level3Filter and cat['level3'] in level3Filter)):
            # Work on a copy so the cursor document itself is left
            # untouched, and drop Mongo's '_id' (pop with a default so a
            # document without '_id' cannot raise KeyError) before it is
            # expanded into keyword arguments.
            paraCat = dict(cat)
            paraCat.pop('_id', None)
            getProduct(hasSpec, hasPrice, **paraCat)
def clearTimeline(strFilter='all'):
    """Strip the cached timeline fields from product documents.

    With strFilter == 'all', every document carrying at least one of the
    two timeline fields is cleaned; any other value is treated as a
    single sku.  Returns how many documents were rewritten.
    """
    tblProductList = openTable()
    if strFilter == 'all':
        dictFilter = {'$or': [{u'最新记录': {'$exists': 'true'}},
                              {u'最早记录': {'$exists': 'true'}}]}
    else:
        dictFilter = {'sku': strFilter}
    count = 0
    for product in tblProductList.find(dictFilter):
        # Drop whichever of the two fields is present; only write the
        # document back (and count it) when something was removed.
        removed = False
        for key in (u'最新记录', u'最早记录'):
            if key in product:
                del product[key]
                removed = True
        if removed:
            tblProductList.update({u'sku': product['sku']}, product)
            count += 1
    return count
def getProduct(hasSpec=True, hasPrice=True, **cat):
    """Crawl every listing page of one category and store its products.

    hasSpec  -- also fetch each product's detail page (getProductDetail)
    hasPrice -- after each listing page, refresh prices for its skus
    cat      -- category document (must contain 'catUrl'); its fields
                are copied into every product record inserted.

    Skus already present in the 'products' table are skipped.  Ctrl-C /
    SystemExit stops the page loop cleanly.
    """
    SUFFIX = '&page=%s&sort=sort_winsdate_desc'
    catUrl = cat['catUrl']
    dbProductList = openTable(tableName='products')
    totalPages = getCatPageNum(catUrl)
    # The product list markup sits between id="plist" and the clr div.
    rule = re.compile(r'id=\"plist\".*?>(.*?)<div class=\"clr\"', re.S)
    for page in range(totalPages):
        try:
            progressBar("getting pages", page, totalPages)
            urlPage = catUrl + SUFFIX
            time.sleep(0.5)  # be polite to the server
            r = session.get(urlPage % (page + 1))
            listUls = re.findall(rule, r.text)
            if not listUls:
                # Layout change or empty response: skip this page
                # instead of crashing with IndexError on listUls[0].
                logger.warning("no product list found on page %d, skip" % (page + 1))
                continue
            soup = BeautifulSoup(listUls[0])
            skuLists = []
            for li in soup.select('.gl-item'):
                product = {}
                product.update(cat)
                product['sku'] = li.find(attrs={"data-sku": True})['data-sku']
                skuLists.append(product['sku'])
                product['url'] = li.select("div > a")[0]['href']
                product['name'] = li.select('.p-name')[0].a.em.text
                # Brand = leading part of the name up to the first
                # whitespace or '('.
                reBrand = re.findall(r'^(.*?)[\s(]', product['name'])
                if reBrand:
                    product['brand'] = reBrand[0]
                try:
                    if dbProductList.find({u'sku': product['sku']}).count() > 0:
                        logger.debug('%s exist,skip' % (product['sku']))
                        continue
                    dbProductList.insert(product)
                    if hasSpec:
                        getProductDetail(product['sku'], product['url'], dbProductList)
                except Exception as e:  # keep crawling after one bad record
                    logger.exception("error in Page:%d, skuid:%s, reason:%s" % (page, product['sku'], str(e)))
                    continue
            if hasPrice:
                updatePrice(skuLists, dbProductList)
        except (KeyboardInterrupt, SystemExit) as e:
            logger.critical("app is interrupted, finished pages:%d" % (page))
            break
def getCategoryUrl(site="", url=""):
    """Download the category page at `url` and store every visible
    level1/level2/level3 category (with its id and url) in 'catdb'.

    Returns False when the page body is empty (fetch failed); otherwise
    returns None after inserting all categories found.
    """
    catDb = openTable(tableName='catdb')
    r = session.get(url)
    if not r.text:
        return False
    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.category-item'):
        # Raw strings for the regexes: '\s' in a plain string is an
        # invalid escape on modern Python.
        curLevel1 = re.sub(r'\s', '', level1.select('.mt')[0].text)
        for level2 in level1.select('dl'):
            curLevel2 = re.sub(r'\s', '', level2.select('dt')[0].text)
            for level3 in level2.select('dd a'):
                curLevel3 = re.sub(r'\s', '', level3.text)
                curlUrl = level3['href']
                # The category id is whatever follows the last '=' in
                # the href; hrefs without one are skipped.
                retFind = re.findall(r'=(.*)$', curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catDb.insert({'catId': curCatID,
                                  'level1': curLevel1,
                                  'level2': curLevel2,
                                  'level3': curLevel3,
                                  'catUrl': curlUrl,
                                  'site': site})
ruleDiv = re.compile(r'(<div id=\"comments-list\".*?)<div class=\"clearfix\"',re.S) if __name__ == '__main__': logger = setLog('INFO') logger.debug('log level, %d' %(logger.level)) URL='http://club.jd.com/review/%s-0-%s-0.html' session = requests.Session() commentFilename ='../comments.txt' myCommentRecord = CommentRecord(commentFilename) #COMMFILE = open(commentFilename, 'w') tblProductList = openTable(dbName='shouji',tableName='productList') for product in tblProductList.find({u'操作系统':{'$regex':'Android'}}): try: skuid= product['sku'] #firstDate:上次获取到的最新评论时间,lastDate:上次获取到的最远评论时间 if u'最新记录' in product: firstDate = time.strptime(product[u'最新记录'],'%Y-%m-%d %H:%M') else: firstDate = time.localtime(time.time()) if u'最早记录' in product: lastDate = time.strptime(product[u'最早记录'],'%Y-%m-%d %H:%M') else: lastDate = firstDate