예제 #1
0
파일: getCategory.py 프로젝트: Neilfu/NLP
def getProduct(hasSpec=True, hasPrice=True, **cat):
    """Scrape every listing page of one category and store each product
    into the ``products`` table.

    Args:
        hasSpec: when True, also fetch the per-product detail page for
            every newly inserted product.
        hasPrice: when True, batch-update prices for the SKUs collected
            on each listing page.
        **cat: category info; must contain ``catUrl`` (the category list
            URL). All keys are copied into every stored product document.
    """
    SUFFIX = "&page=%s&sort=sort_winsdate_desc"
    catUrl = cat["catUrl"]
    dbProductList = openTable(tableName="products")
    totalPages = getCatPageNum(catUrl)
    # Capture the markup between id="plist" and the closing <div class="clr">,
    # i.e. the region that holds the product <li> items.
    rule = re.compile(r"id=\"plist\".*?>(.*?)<div class=\"clr\"", re.S)
    for page in range(totalPages):
        try:
            progressBar("getting pages", page, totalPages)
            urlPage = catUrl + SUFFIX
            time.sleep(0.5)  # throttle: be polite to the server between fetches
            r = session.get(urlPage % (page + 1))
            listUls = re.findall(rule, r.text)
            if not listUls:
                # Fix: the original did listUls[0] unconditionally; an empty
                # match (layout change, empty/error page) raised IndexError
                # and aborted the whole category. Log and skip the page instead.
                logger.warning("no product list found on page:%d, skip" % (page + 1))
                continue
            soup = BeautifulSoup(listUls[0])
            skuLists = []
            for li in soup.select(".gl-item"):
                product = {}
                product.update(cat)
                product["sku"] = li.find(attrs={"data-sku": True})["data-sku"]
                skuLists.append(product["sku"])
                product["url"] = li.select("div > a")[0]["href"]
                product["name"] = li.select(".p-name")[0].a.em.text
                # Brand heuristic: leading token of the name up to the first
                # whitespace or "(".
                reBrand = re.findall(r"^(.*?)[\s(]", product["name"])
                if reBrand:
                    product["brand"] = reBrand[0]
                try:
                    # Skip SKUs already stored so re-runs are idempotent.
                    if dbProductList.find({u"sku": product["sku"]}).count() > 0:
                        logger.debug("%s exist,skip" % (product["sku"]))
                        continue
                    dbProductList.insert(product)
                    if hasSpec:
                        getProductDetail(product["sku"], product["url"], dbProductList)
                except Exception as e:  # `as e` works on py2.6+ and py3 (old `, e` form is py2-only)
                    logger.exception("error in Page:%d, skuid:%s, reason:%s" % (page, product["sku"], str(e)))
                    continue

            if hasPrice:
                updatePrice(skuLists, dbProductList)
        except (KeyboardInterrupt, SystemExit) as e:
            # Manual abort: report how far we got, then stop the page loop.
            logger.critical("app is interrupted, finished pages:%d" % (page))
            break
예제 #2
0
            if u'最早记录' in product:
                lastDate = time.strptime(product[u'最早记录'],'%Y-%m-%d %H:%M')
            else:
                lastDate = firstDate
            #NewestDate:本次解析获取的评论最新时间,OldestDate:本次获取到的最远评论时间
            NewestDate = firstDate
            OldestDate = lastDate                            
            
            myCommentRecord.writeproductHead(skuid)   
            #COMMFILE.write("@@<<<product skuid:%s>>>\n" %(skuid))
            pages = getCommentPages(session,URL %(skuid,1))
            isFirst = True

            if pages > 0 :
                for page in range(pages):
                    progressBar('getting product %s' %(skuid),page+1,pages)
                    try:
                        r = session.get(URL %(skuid,page+1))
                        listDiv = re.findall(ruleDiv,r.text)[0]
                        soup = BeautifulSoup(listDiv)
                        divLists = soup.select('div[class="mc"]')
                        divCount = 0
                        for div in divLists:
                            divCount = divCount + 1
                            try:
                                commentRecord = getCommentRecord(div)
                                #如果评论时间在【lastDate,firstDate】之间,说明已经获取过,跳过
                                currentRecordDate = time.strptime(commentRecord['commentDate'],'%Y-%m-%d %H:%M')
                                if currentRecordDate <= NewestDate and currentRecordDate >  OldestDate:
                                    logger.info('\nskip! product:%s, page:%s, currentRecordDate:%s,NewestDate:%s,OldestDate:%s' 
                                                 %(skuid, page, time.strftime('%Y-%m-%d %H:%M', currentRecordDate),
예제 #3
0
            if field == u'_id':
                line.append(str(row.get(field,'')))
            else:
                line.append(row.get(field,''))
        DUMPFILE.write( '\t'.join(line).encode('utf-8')+'\n')
    DUMPFILE.close()


dbProductList = openTable(dbName='shouji',tableName='productList')
session = requests.Session()
totalPages=getPageNum() 

rule = re.compile(r'id=\"plist\".*?>(.*?)<div class=\"clr\"',re.S)
for page in range(totalPages):
    try:
        progressBar("getting pages",page,totalPages)
        r = session.get(URL %(page+1))
        listUls = re.findall(rule,r.text)
        product={}
        soup = BeautifulSoup(listUls[0])
        skuLists=[]
        for li in soup.select('.gl-item'):
            product={}
            product['sku'] = li.find(attrs={"data-sku":True})['data-sku']
            skuLists.append(product['sku'])
            product['url'] = li.select("div > a")[0]['href']
            try:
                if dbProductList.find({u'sku':product['sku']}).count() >0:
                    logger.debug('%s exist,skip' %(product['sku']))
                    continue
                product.update(getProductDetail(product['url']))