예제 #1
0
파일: get_jd.py 프로젝트: Neilfu/NLP
def getPidList4Cat(hasSpec=False,hasPrice=True,site='',level1Filter=None, level2Filter=None, level3Filter=None):
    """Crawl products for every stored category that matches a level filter.

    Walks all documents of the 'catdb' table. A document is processed when
    its 'catUrl' is truthy and its 'level1', 'level2' or 'level3' value is
    contained in the corresponding non-empty filter. For each such document
    getProduct() is called with the document's fields (minus '_id') as
    keyword arguments.

    hasSpec, hasPrice -- forwarded positionally to getProduct().
    site              -- accepted but not used by this function.
    level1Filter, level2Filter, level3Filter -- containers of accepted
        values; a falsy filter never matches, so with all three left empty
        no category is processed.
    """
    categories = openTable(tableName='catdb')
    for entry in categories.find():
        # Categories without a URL cannot be crawled.
        if not entry['catUrl']:
            continue

        # Try the filters in order; a level field is only read when its
        # filter is non-empty.
        wanted = bool(level1Filter) and entry['level1'] in level1Filter
        if not wanted:
            wanted = bool(level2Filter) and entry['level2'] in level2Filter
        if not wanted:
            wanted = bool(level3Filter) and entry['level3'] in level3Filter
        if not wanted:
            continue

        # '_id' is dropped so it is not passed on as a category field.
        del entry['_id']
        getProduct(hasSpec, hasPrice, **entry)
예제 #2
0
파일: mytools.py 프로젝트: Neilfu/NLP
def clearTimeline(strFilter='all'):
    """Remove the two timeline marker fields from product documents.

    The fields removed are u'最新记录' and u'最早记录' (literally "latest
    record" and "earliest record").

    strFilter -- 'all' to process every document that has at least one of
                 the two fields, otherwise the sku of the product to process.

    Returns the number of documents from which at least one field was
    removed.
    """
    newestKey = u'最新记录'
    oldestKey = u'最早记录'

    tblProductList = openTable()
    if strFilter == 'all':
        # $exists takes a boolean; the string 'true' only worked because any
        # string is truthy.
        dictFilter = {'$or':[{newestKey:{'$exists':True}},{oldestKey:{'$exists':True}}]}
    else:
        dictFilter = {'sku':strFilter}

    count = 0
    for product in tblProductList.find(dictFilter):
        staleKeys = [key for key in (newestKey, oldestKey) if key in product]
        if not staleKeys:
            continue
        # Remove only the marker fields, addressed by _id, instead of writing
        # the whole document back by sku: this cannot overwrite other fields
        # changed in the meantime and always hits exactly this document.
        tblProductList.update({'_id':product['_id']},
                              {'$unset':dict.fromkeys(staleKeys, '')})
        count += 1
    return count
예제 #3
0
파일: get_jd.py 프로젝트: Neilfu/NLP
def getProduct(hasSpec=True,hasPrice=True,**cat):
    """Crawl every listing page of one category and store the products found.

    The number of pages comes from getCatPageNum(). For each page the
    'plist' section is extracted and every '.gl-item' element becomes a
    product document: the category fields plus 'sku', 'url', 'name' and,
    when it can be derived from the name, 'brand'. Products whose sku is not
    yet in the 'products' table are inserted; existing ones are skipped.

    hasSpec  -- when true, getProductDetail() is called for each newly
                inserted product.
    hasPrice -- when true, updatePrice() is called once per page with every
                sku seen on that page (including ones already stored).
    **cat    -- category fields; must contain 'catUrl'. All of them are
                copied into every product document.

    KeyboardInterrupt / SystemExit stop the crawl after logging the page
    that was being processed. Errors while storing a single product are
    logged and that product is skipped.
    """
    SUFFIX = '&page=%s&sort=sort_winsdate_desc'
    catUrl = cat['catUrl']
    dbProductList = openTable(tableName='products')
    totalPages = getCatPageNum(catUrl)
    rule = re.compile(r'id=\"plist\".*?>(.*?)<div class=\"clr\"',re.S)
    # Brand is taken as the text before the first whitespace or '('.
    brandRule = re.compile(r'^(.*?)[\s(]')
    for page in range(totalPages):
        try:
            progressBar("getting pages",page,totalPages)
            # Format only the suffix: catUrl may itself contain '%' (e.g.
            # percent-escapes), which would break a '%' format applied to
            # the whole URL.
            urlPage = catUrl + SUFFIX %(page+1)
            time.sleep(0.5)
            r = session.get(urlPage)
            listUls = rule.findall(r.text)
            if not listUls:
                # No product list on this page; skip it instead of failing
                # the whole crawl with an IndexError.
                logger.warning("no product list found in Page:%d, url:%s" %(page, urlPage))
                continue
            soup = BeautifulSoup(listUls[0])
            skuLists=[]
            for li in soup.select('.gl-item'):
                product = {}
                product.update(cat)
                product['sku'] = li.find(attrs={"data-sku":True})['data-sku']
                # Recorded before the duplicate check so prices are refreshed
                # for already-stored products too.
                skuLists.append(product['sku'])
                product['url'] = li.select("div > a")[0]['href']
                product['name'] = li.select('.p-name')[0].a.em.text
                reBrand = brandRule.findall(product['name'])
                if reBrand:
                    product['brand'] = reBrand[0]
                try:
                    if dbProductList.find({u'sku':product['sku']}).count() >0:
                        logger.debug('%s exist,skip' %(product['sku']))
                        continue
                    dbProductList.insert(product)
                    if hasSpec:
                        getProductDetail(product['sku'],product['url'],dbProductList)
                except Exception as e:
                    logger.exception("error in Page:%d, skuid:%s, reason:%s" %(page, product['sku'], str(e)))
                    continue

            if hasPrice:
                updatePrice(skuLists,dbProductList)
        except (KeyboardInterrupt, SystemExit):
            logger.critical("app is interrupted, finished pages:%d" %(page))
            break
예제 #4
0
파일: get_jd.py 프로젝트: Neilfu/NLP
def getCategoryUrl(site="",url=""):
    """Scrape a three-level category tree from `url` into the 'catdb' table.

    The page is parsed for '.category-item' elements (level 1, titled by
    their '.mt' child), the 'dl' elements inside them (level 2, titled by
    their 'dt' child) and the 'dd a' links inside those (level 3). All
    whitespace is stripped from the three titles. For every level-3 link
    whose href contains '=', one document is inserted with 'catId' (the text
    after the first '='), 'level1', 'level2', 'level3', 'catUrl' and 'site'.

    site -- stored verbatim in each inserted document.
    url  -- address of the category overview page to fetch.

    Returns False when the response body is empty; otherwise returns None
    after processing the page.
    """
    catDb = openTable(tableName='catdb')
    r = session.get(url)
    if not r.text:
        return False

    # Raw strings keep the regex escapes intact ('\s' in a plain string is
    # an invalid escape sequence); compiled once as they are loop-invariant.
    whitespace = re.compile(r'\s')
    catIdRule = re.compile(r'=(.*)$')

    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.category-item'):
        curLevel1 = whitespace.sub('', level1.select('.mt')[0].text)

        for level2 in level1.select('dl'):
            curLevel2 = whitespace.sub('', level2.select('dt')[0].text)
            for level3 in level2.select('dd a'):
                curLevel3 = whitespace.sub('', level3.text)
                curlUrl = level3['href']
                retFind = catIdRule.findall(curlUrl)
                if not retFind:
                    # Links without a '=' carry no category id; skip them.
                    continue
                curCatID = retFind[0]
                catDb.insert({'catId':curCatID,'level1':curLevel1, 'level2':curLevel2, 'level3':curLevel3, 'catUrl':curlUrl, 'site':site})
예제 #5
0
ruleDiv = re.compile(r'(<div id=\"comments-list\".*?)<div class=\"clearfix\"',re.S)

if __name__ == '__main__':
    logger = setLog('INFO')
    logger.debug('log level, %d' %(logger.level))
    
    URL='http://club.jd.com/review/%s-0-%s-0.html'
    session = requests.Session()
    
    commentFilename ='../comments.txt'
    myCommentRecord = CommentRecord(commentFilename)
    
    
    #COMMFILE = open(commentFilename, 'w')
    tblProductList = openTable(dbName='shouji',tableName='productList')

    for product in tblProductList.find({u'操作系统':{'$regex':'Android'}}): 
        try:
            skuid= product['sku']
            
            #firstDate:上次获取到的最新评论时间,lastDate:上次获取到的最远评论时间
            if u'最新记录' in product:
                firstDate = time.strptime(product[u'最新记录'],'%Y-%m-%d %H:%M')
            else:
                firstDate = time.localtime(time.time())
            
            if u'最早记录' in product:
                lastDate = time.strptime(product[u'最早记录'],'%Y-%m-%d %H:%M')
            else:
                lastDate = firstDate