def getProduct(hasSpec=True, hasPrice=True, **cat):
    """Crawl every list page of a category and store each product in MongoDB."""
    SUFFIX = "&page=%s&sort=sort_winsdate_desc"
    catUrl = cat["catUrl"]
    dbProductList = openTable(tableName="products")
    totalPages = getCatPageNum(catUrl)
    # The product list sits between id="plist" and <div class="clr"> in the page source.
    rule = re.compile(r"id=\"plist\".*?>(.*?)<div class=\"clr\"", re.S)
    for page in range(totalPages):
        try:
            progressBar("getting pages", page, totalPages)
            urlPage = catUrl + SUFFIX
            time.sleep(0.5)
            r = session.get(urlPage % (page + 1))
            listUls = re.findall(rule, r.text)
            soup = BeautifulSoup(listUls[0])
            skuLists = []
            for li in soup.select(".gl-item"):
                product = {}
                product.update(cat)
                product["sku"] = li.find(attrs={"data-sku": True})["data-sku"]
                skuLists.append(product["sku"])
                product["url"] = li.select("div > a")[0]["href"]
                product["name"] = li.select(".p-name")[0].a.em.text
                # The brand is the leading part of the product name, up to the first space or "(".
                reBrand = re.findall(r"^(.*?)[\s(]", product["name"])
                if reBrand:
                    product["brand"] = reBrand[0]
                try:
                    # Skip products that are already in the collection.
                    if dbProductList.find({u"sku": product["sku"]}).count() > 0:
                        logger.debug("%s exists, skip" % (product["sku"]))
                        continue
                    dbProductList.insert(product)
                    if hasSpec:
                        getProductDetail(product["sku"], product["url"], dbProductList)
                except Exception as e:
                    logger.exception("error in Page:%d, skuid:%s, reason:%s" % (page, product["sku"], str(e)))
                    continue
            if hasPrice:
                updatePrice(skuLists, dbProductList)
        except (KeyboardInterrupt, SystemExit):
            logger.critical("app is interrupted, finished pages:%d" % (page))
            break
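# A minimal invocation sketch (assumptions: the category URL, the "catName" key
# and the __main__ wiring below are illustrative; `session`, `logger`, `openTable`,
# `getCatPageNum`, `progressBar`, `getProductDetail` and `updatePrice` are expected
# to be defined elsewhere in this script).
if __name__ == "__main__":
    cat = {
        "catUrl": "http://list.example.com/list.html?cat=9987,653,655",  # placeholder category list URL
        "catName": u"手机",  # hypothetical extra field, copied into every product document
    }
    # Crawl the whole category: product list, spec details and prices.
    getProduct(hasSpec=True, hasPrice=True, **cat)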
# Fragment from the comment crawler: incremental fetch of one product's comments
# (the enclosing function, plus firstDate, skuid, URL, ruleDiv, myCommentRecord
# and getCommentRecord, are defined elsewhere in the script).
if u'最早记录' in product:  # key u'最早记录' = "earliest record" stored for this product
    lastDate = time.strptime(product[u'最早记录'], '%Y-%m-%d %H:%M')
else:
    lastDate = firstDate
# NewestDate: newest comment time obtained in this crawl; OldestDate: oldest comment time obtained in this crawl.
NewestDate = firstDate
OldestDate = lastDate
myCommentRecord.writeproductHead(skuid)
# COMMFILE.write("@@<<<product skuid:%s>>>\n" % (skuid))
pages = getCommentPages(session, URL % (skuid, 1))
isFirst = True
if pages > 0:
    for page in range(pages):
        progressBar('getting product %s' % (skuid), page + 1, pages)
        try:
            r = session.get(URL % (skuid, page + 1))
            listDiv = re.findall(ruleDiv, r.text)[0]
            soup = BeautifulSoup(listDiv)
            divLists = soup.select('div[class="mc"]')
            divCount = 0
            for div in divLists:
                divCount = divCount + 1
                try:
                    commentRecord = getCommentRecord(div)
                    # If the comment time lies between lastDate and firstDate, i.e. inside
                    # (OldestDate, NewestDate], it was already fetched in a previous run, so skip it.
                    currentRecordDate = time.strptime(commentRecord['commentDate'], '%Y-%m-%d %H:%M')
                    if currentRecordDate <= NewestDate and currentRecordDate > OldestDate:
                        logger.info('\nskip! product:%s, page:%s, currentRecordDate:%s, NewestDate:%s, OldestDate:%s'
                                    % (skuid, page,
                                       time.strftime('%Y-%m-%d %H:%M', currentRecordDate),
                                       time.strftime('%Y-%m-%d %H:%M', NewestDate),
                                       time.strftime('%Y-%m-%d %H:%M', OldestDate)))
                        continue
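# Standalone sketch of the incremental-fetch window used above (assumptions: the
# helper name and the sample timestamps are illustrative; dates are struct_time
# values from time.strptime, which compare correctly field by field).
import time

def alreadyFetched(commentDateStr, newestDate, oldestDate, fmt='%Y-%m-%d %H:%M'):
    """Return True when a comment's timestamp falls inside (oldestDate, newestDate],
    i.e. it was collected in a previous run and can be skipped."""
    current = time.strptime(commentDateStr, fmt)
    return current <= newestDate and current > oldestDate

newest = time.strptime('2015-06-01 12:00', '%Y-%m-%d %H:%M')  # newest record already stored
oldest = time.strptime('2015-05-01 08:00', '%Y-%m-%d %H:%M')  # value of u'最早记录'
print(alreadyFetched('2015-05-20 10:30', newest, oldest))  # True  -> already fetched, skip
print(alreadyFetched('2015-06-10 09:00', newest, oldest))  # False -> new comment, keep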
# Tail of the dump-to-file routine: _id (an ObjectId) is stringified, all other
# fields are written as-is, one tab-separated line per MongoDB document.
            if field == u'_id':
                line.append(str(row.get(field, '')))
            else:
                line.append(row.get(field, ''))
        DUMPFILE.write('\t'.join(line).encode('utf-8') + '\n')
    DUMPFILE.close()


dbProductList = openTable(dbName='shouji', tableName='productList')
session = requests.Session()
totalPages = getPageNum()
# The product list sits between id="plist" and <div class="clr"> in the page source.
rule = re.compile(r'id=\"plist\".*?>(.*?)<div class=\"clr\"', re.S)
for page in range(totalPages):
    try:
        progressBar("getting pages", page, totalPages)
        r = session.get(URL % (page + 1))
        listUls = re.findall(rule, r.text)
        soup = BeautifulSoup(listUls[0])
        skuLists = []
        for li in soup.select('.gl-item'):
            product = {}
            product['sku'] = li.find(attrs={"data-sku": True})['data-sku']
            skuLists.append(product['sku'])
            product['url'] = li.select("div > a")[0]['href']
            try:
                # Skip skus that are already in the collection, otherwise fetch the detail page.
                if dbProductList.find({u'sku': product['sku']}).count() > 0:
                    logger.debug('%s exists, skip' % (product['sku']))
                    continue
                product.update(getProductDetail(product['url']))
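# Sketch of the full dump-to-TSV routine that the fragment at the top of this
# excerpt is the tail of (assumptions: the function name, file name and `fields`
# parameter are illustrative; `openTable` is the project's MongoDB helper used above).
def dumpProductList(fields, path='productList.txt'):
    dbProductList = openTable(dbName='shouji', tableName='productList')
    DUMPFILE = open(path, 'w')
    # Header row, then one tab-separated line per MongoDB document.
    DUMPFILE.write('\t'.join(fields).encode('utf-8') + '\n')
    for row in dbProductList.find():
        line = []
        for field in fields:
            if field == u'_id':
                line.append(str(row.get(field, '')))  # ObjectId -> str
            else:
                line.append(row.get(field, ''))
        DUMPFILE.write('\t'.join(line).encode('utf-8') + '\n')
    DUMPFILE.close()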