Пример #1
0
    dlLists = div.select('.dl-extra')[0]('dl')
    commentExtra = {}
    for dl in dlLists:
        title = re.sub(unicode(r'[: #]'),'',dl.dt.text)
        commentExtra[title] = re.sub(r'[\s\r\n#]','',dl.dd.text)
    commentRecord['commentExtra'] = commentExtra
    # Parse the commenter's user name and user ID
    u_name = div.select('.u-name')[0].a
    commentRecord['commentUsername'] = u_name.text.strip()
    commentRecord['commentUserid'] = re.findall(r'com/(.*).html',u_name['href'])[0]
    return commentRecord 

# Captures everything from the opening `<div id="comments-list"` tag up to
# (but not including) the next `<div class="clearfix"`; DOTALL lets `.` match
# newlines so the capture can span multiple lines of markup.
ruleDiv = re.compile(
    r'(<div id=\"comments-list\".*?)<div class=\"clearfix\"',
    flags=re.DOTALL
)

if __name__ == '__main__':
    logger = setLog('INFO')
    logger.debug('log level, %d' %(logger.level))
    
    URL='http://club.jd.com/review/%s-0-%s-0.html'
    session = requests.Session()
    
    commentFilename ='../comments.txt'
    myCommentRecord = CommentRecord(commentFilename)
    
    
    #COMMFILE = open(commentFilename, 'w')
    tblProductList = openTable(dbName='shouji',tableName='productList')

    for product in tblProductList.find({u'操作系统':{'$regex':'Android'}}): 
        try:
            skuid= product['sku']
Пример #2
0
            para[strKey] = True
        if opt in ["-h", "--help"]:
            usage()
            sys.exit()
    return para


def usage():
    """Print the command-line synopsis for this script to standard output.

    The single-argument call form of ``print`` is used so the line parses
    under both Python 2 (print statement with a parenthesized string) and
    Python 3 (function call), producing the same text on either.
    """
    # The literal itself ends in "\n", so the usage line is followed by one
    # blank line in the output.
    print("Usage: python getCategory.py [--help] [--site] [--hasPrice] [--hasSpec] [--homeUrl]  [--level1] [--level2] [--level3]\n")


if __name__ == "__main__":

    retPara = parseCommandLine()

    logger = setLog("INFO")
    logger.debug("log level, %d" % (logger.level))
    session = requests.Session()

    targetSite = retPara.get("site", u"京东")
    targetUrl = retPara.get("homeUrl", "http://www.jd.com/allSort.aspx")
    level1 = retPara.get("level1", None)
    level2 = retPara.get("level2", None)
    level3 = retPara.get("level3", None)
    spec = retPara.get("hasSpec", False)
    price = retPara.get("hasPrice", False)

    getCategoryUrl(site=targetSite, url=targetUrl)
    # getPidList4Cat(hasSpec=False,hasPrice=True,site=targetSite,level1Filter=[u'数码'])
    getPidList4Cat(
        hasSpec=spec, hasPrice=price, site=targetSite, level1Filter=level1, level2Filter=level2, level3Filter=level3