Exemplo n.º 1
0
def fetchaipai(gameName, startfrom, endto, gameidx, save=True):
    videoCount = 0
    addCount = 0
# open bad link
    urls = []
    with open("%s/pyspider/aipaibadlink2.list" % projectPathBase, "r") as badList:
        for l in badList:
            urls.append(l.strip())
    gameidx = 0

    for idx in range(startfrom, endto):
        url = "%s%s%s%d.html" % ( URLBase, urllib.quote(gameName), URLPage, idx)  
        print url
        if len(url) < 5:
            gameidx = int(url)
            with open("%s/pyspider/aipaibadlink3.list" % projectPathBase, "a") as badfile:
                badfile.write("%d\n" % gameidx)
            continue
        opener = urllib2.OpenerDirector()
        handler = urllib2.HTTPHandler()
        opener.add_handler(handler)
        opener.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)')]
        content_stream = opener.open(url)
        content = content_stream.read()

        store.savePage("indexaipai.html", content)

        soupContent = bs4.BeautifulSoup(content)  
        
        mainContents = soupContent.select(".mod_svlist")
        if len(mainContents) <= 0:
            with open("%s/pyspider/aipaibadlink3.list" % projectPathBase, "a") as badfile:
                badfile.write("%s\n" % url)
            continue
        mainContentList = soupContent.select(".mod_svlist")[0]
        vs = mainContentList.select("li")
        
        if len(vs) <= 0:
            break
        
        hrefList = []
        titleList = []
        log_vidList = []
        playtimeList = []
        pubdateList = []
        uploaderList = []
        playCountList = []
        tagsList = []

        for v in vs:
            linkA = v.select(".com_v_tit")[0]
# title, href            
            title = linkA['title']
            href = linkA['href']
            print title, href

            lastslash = href.rfind("/")
            lastdot = href.rfind(".")
# log_vid
            log_vid = href[lastslash+1:lastdot]
            print log_vid,
# img
            linkImg = v.select(".pic")[0]
            thumbImg = linkImg.select("img")[0]
            thumbImgSrc = thumbImg['src']
#            with open("aipaiImg.list", "a") as imgListFile:
#                imgListFile.write("%s.jpg:%s\n" % (log_vid, thumbImgSrc))
#            urllib.urlretrieve(thumbImgSrc, "%s/%s.jpg" % (storePath, log_vid))
# tags            
            tagsA = v.select(".tag")[0]
            tagsAs = tagsA.select("a")
            tags = []
            for tagA in tagsAs:
                tags.append(tagA.get_text()) 
            tagStr = ";".join(tags)

            infoDiv = v.select(".info")[0]

            infos = infoDiv.select("em")
# playCount            
            playCount = int(infos[0].get_text())
            print playCount,
            datestr = infos[2].get_text()
            print datestr,
            pubdate = None
            noletter = re.sub("[^0-9]", "", datestr)
            if datestr.find(u"周") >= 0 or datestr.find(u"星期") >= 0:
                pubdate = datetime.today()-relativedelta(weeks=int(noletter))
            elif datestr.find(u"天") >= 0:
                pubdate = datetime.today()-relativedelta(days=int(noletter))
            elif datestr.find(u"小时") >= 0: 
                pubdate = datetime.today()-relativedelta(hours=int(noletter))
            elif datestr.find(u"分钟") >= 0: 
                pubdate = datetime.today()-relativedelta(minutes=int(noletter))
            elif datestr.find(u"秒") >= 0: 
                pubdate = datetime.today()-relativedelta(seconds=int(noletter))
            else:
                pubdate = datetime.strptime(datestr, "%Y-%m-%d").date()
# pubdate
            print pubdate

            authorA = infoDiv.select(".hr_zz")[0]
# uploader            
            uploader = authorA.get_text()
            print uploader
            
            hrefList.append(href)
            titleList.append(title)
            log_vidList.append(log_vid)
            pubdateList.append(pubdate)
            uploaderList.append(uploader)
            playCountList.append(playCount)
            tagsList.append(tagStr)
            videoCount = videoCount + 1

        count = len(playCountList)
        if save == True:
            for i in range(count):
                addCount = addCount + store.storeVideo(titleList[i], hrefList[i], log_vidList[i], uploaderList[i], playCountList[i], pubdateList[i], "4", gameidx, tagsList[i], thumbImgSrc)
                print log_vidList[i], uploaderList[i], playCountList[i], pubdateList[i], thumbImgSrc

        time.sleep(5)
    return videoCount, addCount
Exemplo n.º 2
0
def fetchYouku(gameName, startfrom, endto, gameidx, save=True):
    videoCount = 0
    addCount = 0
    for idx in range(startfrom, endto):
        url = "%s%s%s%d%s" % ( sokuURLBase, urllib.quote(gameName),sokuURLPage, idx, sokuURLOther)  
        print url
        
        opener = urllib2.OpenerDirector()
        handler = urllib2.HTTPHandler()
        opener.add_handler(handler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11')]
        content_stream = opener.open(url)
        content = content_stream.read()

#        store.savePage("index.html", content)
        
        soupContent = bs4.BeautifulSoup(content)  
        
        mainResults = soupContent.select(".sk_result")
        if len(mainResults) <= 0:
            break

        mainResult = mainResults[0]
        vs = mainResult.select(".v")
        if len(vs) <= 0:
            break

        for v in vs:
            meta = v.select(".v-meta")[0]
            metaTitle = meta.select(".v-meta-title")[0]
            linkA = metaTitle.select("a")[0]
            title = linkA['title']
            href = linkA['href']
            log_vid = linkA['_log_vid']
            metaEntry = meta.select(".v-meta-entry")[0]
            metaDatas = metaEntry.select(".v-meta-data")
            thumbDiv = v.select(".v-thumb")[0]
            thumbImg = thumbDiv.select("img")[0]
            thumbImgSrc = thumbImg["src"]
#            urllib.urlretrieve(thumbImgSrc, "%s/%s.jpg" % (storePath, log_vid))
#            with open("youkuPic.list", "a") as picFile:
#                picFile.write("%s.jpg:%s\n" % (log_vid, thumbImgSrc)) 
            
            uploader = ""
            playtime = 0
            publishdate = datetime.today()
            for mdata in metaDatas:
                mlabel = mdata.select("label")[0]
                labelText = mlabel.get_text().strip()
                mspan = mdata.select("span")[0]
                spanText = mspan.get_text().strip()
                if labelText.startswith(u"用户"):
                    uploader = spanText
                elif labelText.startswith(u"播客"):
                    uploader = spanText
                elif labelText.startswith(u"播放"):
                    noletter = re.sub("[^0-9]", "", spanText)
                    playtime = int(noletter)
                elif labelText.startswith(u"发布"):
                    noletter = re.sub("[^0-9]", "", spanText)
                    if spanText.find(u"秒") >= 0:
                        publishdate = publishdate-relativedelta(seconds=int(noletter))
                    elif spanText.find(u"分钟") >= 0:
                        publishdate = publishdate-relativedelta(minutes=int(noletter))
                    elif spanText.find(u"小时") >= 0:
                        publishdate = publishdate-relativedelta(hours=int(noletter))
                    elif spanText.find(u"天") >= 0:
                        publishdate = publishdate-relativedelta(days=int(noletter))
                    elif spanText.find(u"周") >= 0:
                        publishdate = publishdate-relativedelta(weeks=int(noletter))
                    elif spanText.find(u"月") >= 0:
                        publishdate = publishdate-relativedelta(months=int(noletter))
                    elif spanText.find(u"年") >= 0:
                        publishdate = publishdate-relativedelta(years=int(noletter))
            print uploader, playtime, publishdate, thumbImgSrc
            if save == True:
                addCount = addCount + store.storeVideo(title, href, log_vid, uploader, playtime, publishdate, "1", gameidx, "", thumbImgSrc)
            videoCount = videoCount + 1
        time.sleep(5)
    return videoCount, addCount