# User-agent header sent with every aipai listing request.
_AIPAI_USER_AGENT = (
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; '
    'SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; '
    'SE 2.X MetaSr 1.0)'
)

# Relative-age markers looked for in an item's date text, in priority order.
# Each entry maps the marker substrings to the relativedelta keyword that the
# number found in the text is applied to.
_AIPAI_AGE_UNITS = (
    ((u"周", u"星期"), "weeks"),
    ((u"天",), "days"),
    ((u"小时",), "hours"),
    ((u"分钟",), "minutes"),
    ((u"秒",), "seconds"),
)


def _aipai_parse_pubdate(datestr):
    """Convert an item's date text into a publication date.

    Text containing one of the relative-age markers is interpreted as
    "<number> <unit> ago" and yields a ``datetime`` relative to now; any
    other text must be an absolute ``YYYY-MM-DD`` date and yields a ``date``.

    Raises:
        ValueError: if relative text carries no digits, or absolute text does
            not match ``%Y-%m-%d``.
    """
    digits = re.sub("[^0-9]", "", datestr)
    for markers, unit in _AIPAI_AGE_UNITS:
        if any(marker in datestr for marker in markers):
            return datetime.today() - relativedelta(**{unit: int(digits)})
    return datetime.strptime(datestr, "%Y-%m-%d").date()


def _aipai_parse_item(item):
    """Extract one video's fields from a result-list ``<li>`` element.

    Returns:
        Tuple ``(title, href, log_vid, uploader, playCount, pubdate, tagStr,
        thumbImgSrc)``.

    Raises:
        IndexError / KeyError / ValueError: if the element lacks the expected
            markup or its play count / date text cannot be parsed.
    """
    linkA = item.select(".com_v_tit")[0]
    title = linkA['title']
    href = linkA['href']
    # Single-argument print(...) emits the same text under Python 2's print
    # statement as the comma-separated form would.
    print("%s %s" % (title, href))

    # The video id is the file name of the link, without its extension.
    log_vid = href[href.rfind("/") + 1:href.rfind(".")]

    thumbImgSrc = item.select(".pic")[0].select("img")[0]['src']

    tagLinks = item.select(".tag")[0].select("a")
    tagStr = ";".join([tagA.get_text() for tagA in tagLinks])

    infoDiv = item.select(".info")[0]
    infos = infoDiv.select("em")
    playCount = int(infos[0].get_text())
    datestr = infos[2].get_text()
    pubdate = _aipai_parse_pubdate(datestr)
    print("%s %s %s %s" % (log_vid, playCount, datestr, pubdate))

    uploader = infoDiv.select(".hr_zz")[0].get_text()
    print(uploader)

    return (title, href, log_vid, uploader, playCount, pubdate, tagStr,
            thumbImgSrc)


def fetchaipai(gameName, startfrom, endto, gameidx, save=True):
    """Crawl aipai listing pages for a game and optionally store the videos.

    Pages ``startfrom`` .. ``endto - 1`` are fetched in order. Each fetched
    page is also written out through ``store.savePage("indexaipai.html", ...)``.
    A page without a ``.mod_svlist`` element has its URL appended to
    ``aipaibadlink3.list`` and is skipped; a page whose list holds no ``<li>``
    items ends the crawl.

    Args:
        gameName: Game name; URL-quoted and inserted into the listing URL.
        startfrom: First page index (inclusive).
        endto: Last page index (exclusive).
        gameidx: Game index handed to ``store.storeVideo`` for every video.
        save: When truthy, every parsed video is passed to
            ``store.storeVideo``; otherwise pages are only parsed and counted.

    Returns:
        Tuple ``(videoCount, addCount)``: the number of list items parsed and
        the sum of the values returned by ``store.storeVideo`` (presumably the
        number of newly added rows -- confirm against ``store``).
    """
    videoCount = 0
    addCount = 0
    badLinkPath = "%s/pyspider/aipaibadlink3.list" % projectPathBase
    quotedName = urllib.quote(gameName)

    # A bare OpenerDirector with only the HTTP handler: no redirect, proxy or
    # HTTP-error handlers are installed.
    opener = urllib2.OpenerDirector()
    opener.add_handler(urllib2.HTTPHandler())
    opener.addheaders = [('User-agent', _AIPAI_USER_AGENT)]

    for idx in range(startfrom, endto):
        url = "%s%s%s%d.html" % (URLBase, quotedName, URLPage, idx)
        print(url)

        content_stream = opener.open(url)
        try:
            content = content_stream.read()
        finally:
            # Release the connection even when reading fails.
            content_stream.close()
        store.savePage("indexaipai.html", content)

        soupContent = bs4.BeautifulSoup(content)
        mainContents = soupContent.select(".mod_svlist")
        if not mainContents:
            # No result list on this page: record the URL and move on.
            with open(badLinkPath, "a") as badfile:
                badfile.write("%s\n" % url)
            continue

        items = mainContents[0].select("li")
        if not items:
            break

        # One record per video, so each keeps its own thumbnail URL.
        videos = []
        for item in items:
            videos.append(_aipai_parse_item(item))
        videoCount += len(videos)

        if save:
            for (title, href, log_vid, uploader, playCount, pubdate, tagStr,
                 thumbImgSrc) in videos:
                # "4" is the source identifier given to the store for this
                # site (fetchYouku passes "1").
                addCount += store.storeVideo(
                    title, href, log_vid, uploader, playCount, pubdate,
                    "4", gameidx, tagStr, thumbImgSrc)
                print("%s %s %s %s %s" % (
                    log_vid, uploader, playCount, pubdate, thumbImgSrc))

        # Pause between page requests.
        time.sleep(5)

    return videoCount, addCount
# User-agent header sent with every soku/Youku search request.
_YOUKU_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 '
    '(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
)

# Relative-age markers looked for in the publish-date text, in priority
# order, with the relativedelta keyword the number in the text applies to.
_YOUKU_AGE_UNITS = (
    (u"秒", "seconds"),
    (u"分钟", "minutes"),
    (u"小时", "hours"),
    (u"天", "days"),
    (u"周", "weeks"),
    (u"月", "months"),
    (u"年", "years"),
)


def _youku_publish_date(spanText, base):
    """Return ``base`` moved back by the relative age described in ``spanText``.

    ``base`` is returned unchanged when the text contains no known unit
    marker or no digits.
    """
    digits = re.sub("[^0-9]", "", spanText)
    if not digits:
        return base
    for marker, unit in _YOUKU_AGE_UNITS:
        if marker in spanText:
            return base - relativedelta(**{unit: int(digits)})
    return base


def _youku_parse_meta(metaDatas):
    """Read uploader, play count and publish date from ``.v-meta-data`` rows.

    Each row is expected to hold a ``<label>`` naming the field and a
    ``<span>`` with its value. Fields that are absent, or whose value holds
    no digits where a number is needed, keep their defaults: empty uploader,
    play count ``0`` and the current time.

    Returns:
        Tuple ``(uploader, playtime, publishdate)``.
    """
    uploader = ""
    playtime = 0
    publishdate = datetime.today()
    for mdata in metaDatas:
        labelText = mdata.select("label")[0].get_text().strip()
        spanText = mdata.select("span")[0].get_text().strip()
        if labelText.startswith((u"用户", u"播客")):
            uploader = spanText
        elif labelText.startswith(u"播放"):
            digits = re.sub("[^0-9]", "", spanText)
            if digits:
                playtime = int(digits)
        elif labelText.startswith(u"发布"):
            publishdate = _youku_publish_date(spanText, publishdate)
    return uploader, playtime, publishdate


def fetchYouku(gameName, startfrom, endto, gameidx, save=True):
    """Crawl soku search-result pages for a game and optionally store videos.

    Pages ``startfrom`` .. ``endto - 1`` are fetched in order. The crawl stops
    at the first page that has no ``.sk_result`` element or no ``.v`` entries
    inside it.

    Args:
        gameName: Search keyword; URL-quoted and inserted into the search URL.
        startfrom: First page index (inclusive).
        endto: Last page index (exclusive).
        gameidx: Game index handed to ``store.storeVideo`` for every video.
        save: When truthy, every parsed video is passed to
            ``store.storeVideo``; otherwise pages are only parsed and counted.

    Returns:
        Tuple ``(videoCount, addCount)``: the number of result entries parsed
        and the sum of the values returned by ``store.storeVideo`` (presumably
        the number of newly added rows -- confirm against ``store``).

    Raises:
        IndexError / KeyError: if a result entry lacks the expected markup.
    """
    videoCount = 0
    addCount = 0
    quotedName = urllib.quote(gameName)

    # A bare OpenerDirector with only the HTTP handler: no redirect, proxy or
    # HTTP-error handlers are installed.
    opener = urllib2.OpenerDirector()
    opener.add_handler(urllib2.HTTPHandler())
    opener.addheaders = [('User-agent', _YOUKU_USER_AGENT)]

    for idx in range(startfrom, endto):
        url = "%s%s%s%d%s" % (
            sokuURLBase, quotedName, sokuURLPage, idx, sokuURLOther)
        # Single-argument print(...) emits the same text under Python 2's
        # print statement as the comma-separated form would.
        print(url)

        content_stream = opener.open(url)
        try:
            content = content_stream.read()
        finally:
            # Release the connection even when reading fails.
            content_stream.close()

        soupContent = bs4.BeautifulSoup(content)
        mainResults = soupContent.select(".sk_result")
        if not mainResults:
            break
        entries = mainResults[0].select(".v")
        if not entries:
            break

        for entry in entries:
            meta = entry.select(".v-meta")[0]
            linkA = meta.select(".v-meta-title")[0].select("a")[0]
            title = linkA['title']
            href = linkA['href']
            log_vid = linkA['_log_vid']

            thumbImgSrc = entry.select(".v-thumb")[0].select("img")[0]["src"]

            metaDatas = meta.select(".v-meta-entry")[0].select(".v-meta-data")
            uploader, playtime, publishdate = _youku_parse_meta(metaDatas)
            print("%s %s %s %s" % (
                uploader, playtime, publishdate, thumbImgSrc))

            if save:
                # "1" is the source identifier given to the store for this
                # site (fetchaipai passes "4"); no tags are available here.
                addCount += store.storeVideo(
                    title, href, log_vid, uploader, playtime, publishdate,
                    "1", gameidx, "", thumbImgSrc)
            videoCount += 1

        # Pause between page requests.
        time.sleep(5)

    return videoCount, addCount