def _kidsRequest(method, url, headers, body=None):
    """Send one HTTPS request to www.netflix.com.

    Returns a ``(response, text)`` pair where ``text`` is the gunzipped
    response body. The connection is always closed before returning.
    """
    conn = httplib.HTTPSConnection("www.netflix.com")
    try:
        conn.request(method=method, url=url, body=body, headers=headers)
        response = conn.getresponse()
        data = response.read()
    finally:
        conn.close()
    return response, GzipFile(fileobj=StringIO(data)).read()


def downloadKidSources(authURL):
    """Download every Kids category page into ``htm_sources/``.

    If ``authURL`` is the empty string, it is scraped from the ``/Kids``
    landing page first. The genre list is then requested from the
    pathEvaluator endpoint, and each category page is fetched and written to
    ``htm_sources/kid_<category>``.

    Returns the list of category identifiers that were downloaded. The list
    is empty if the genre-list request never produced parseable JSON within
    the retry budget.
    """
    maxAttempts = 5  # same retry budget crawlShow uses for its first request
    kidCategories = []

    if authURL == "":
        _, showHTM = _kidsRequest("GET", "/Kids", kidHeaderDict)
        authURL = showHTM[(showHTM.find('"authURL":"') + 11):].split('"')[0]

    jsonPut = '{"paths":[["genreList",{"from":0,"to":40},["id","menuName"]],["genreList","summary"]],"authURL":"' + authURL + '"}'

    # Retry only when the reply is not valid JSON, and give up after a bounded
    # number of attempts instead of looping forever.
    for _ in range(maxAttempts):
        _, jsonString = _kidsRequest(
            "POST",
            "/api/shakti/e6f64e0c/pathEvaluator?withSize=true&materialize=true&model=harris",
            jsonHeaderDict,
            body=jsonPut,
        )
        try:
            jsonData = json.loads(jsonString)
        except ValueError:
            continue
        break
    else:
        return kidCategories

    genreList = jsonData['value']['genreList']
    for x in range(0, 41):
        entry = genreList[str(x)]
        # Only two-element entries are used; their second element is the
        # category identifier.
        if len(entry) == 2:
            kidCategories.append(entry[1])

    for kidCategory in kidCategories:
        response, categoryHTM = _kidsRequest(
            "GET", "/Kids/category/" + kidCategory, kidHeaderDict
        )
        # Trailing comma keeps the status on the same line under Python 2.
        print("Getting Kids Category " + kidCategory),
        print(response.status, response.reason)
        with open("htm_sources/kid_" + kidCategory, 'wt') as out:
            out.write(categoryHTM)

    return kidCategories
def readIDX(self):
    """Load the idx file into parallel word and offset lists.

    The file is parsed as a sequence of records: a NUL-terminated UTF-8 word
    followed by ``__maxOffsetLen * 2`` bytes that are unpacked with the
    struct format ``"!II"`` (two big-endian unsigned 32-bit integers).
    Parsing stops at the first empty word or when no further NUL byte exists.

    On return ``self.__wordList`` holds the words in file order and
    ``self.__posList`` holds the first integer of every record plus one
    trailing entry equal to the sum of the last record's two integers. An
    index without any record yields ``[]`` and ``[0]``.

    A file name ending in ".gz" (case-insensitive) is decompressed fully into
    memory; any other file is memory-mapped read-only.
    """
    self.logger.debug("loading idx file ...")
    leng = self.__class__.__maxOffsetLen * 2
    w, p = [], []
    f = None
    fmap = None
    cur = 0
    # Pre-seeded so the trailing sentinel is well defined even when the index
    # contains no records at all.
    pos = (0, 0)
    try:
        if self.__idxFileName.lower().endswith(".gz"):
            self.logger.debug("idx file is gzip format!")
            with GzipFile(self.__idxFileName, "rb") as gz:
                fmap = gz.read()
        else:
            f = open(self.__idxFileName, "rb")
            fmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

        while True:
            # Inlined scan; avoids calling self.__readUntilZeroEx.
            idx = fmap.find(b"\0", cur)
            if idx != -1:
                wordStr = fmap[cur:idx].decode("utf-8")
                cur = idx + 1
            else:
                wordStr = ""
            if not wordStr:
                p.append(pos[0] + pos[1])
                break
            w.append(wordStr)
            # Inlined unpack; avoids calling self.__readNumbers.
            pos = struct.unpack("!II", fmap[cur : cur + leng])
            cur += leng
            p.append(pos[0])
    finally:
        if hasattr(fmap, "close"):
            fmap.close()
        # Otherwise it is one large bytes object; drop the reference.
        fmap = None
        if f is not None:
            f.close()
    self.logger.debug("len(w)=%d len(p)=%d, %d", len(w), len(p), p[-1])
    self.logger.debug("sizeof w is %d, sizeof p is %d", sys.getsizeof(w), sys.getsizeof(p))
    self.__wordList, self.__posList = w, p
    self.logger.debug("idx file loaded.")
def _showRequest(method, url, headers, body=None):
    """Send one HTTPS request to www.netflix.com and return the gunzipped body.

    The connection is always closed before returning.
    """
    conn = httplib.HTTPSConnection("www.netflix.com")
    try:
        conn.request(method=method, url=url, body=body, headers=headers)
        data = conn.getresponse().read()
    finally:
        conn.close()
    return GzipFile(fileobj=StringIO(data)).read()


def _showPathEvaluator(jsonPut, maxAttempts=5):
    """POST ``jsonPut`` to the pathEvaluator endpoint and parse the reply.

    Retries while the reply is not valid JSON. Returns the parsed object, or
    ``None`` when ``maxAttempts`` replies in a row could not be parsed.
    """
    for _ in range(maxAttempts):
        jsonString = _showRequest(
            "POST",
            "/api/shakti/e6f64e0c/pathEvaluator?withSize=true&materialize=true&model=harris",
            jsonHeaderDict,
            body=jsonPut,
        )
        try:
            return json.loads(jsonString)
        except ValueError:
            continue
    return None


def crawlShow(showName, showLink, authURL, videoDict, final_results):
    """Record the watch URL(s) for one show.

    If ``authURL`` is the empty string, it is scraped from the show's Kids
    title page first. The show's season list is then requested:

    * no seasons: a single ``showName -> watch URL`` entry is recorded;
    * otherwise: one entry per episode of every season is recorded, keyed as
      ``"<showName>/Season <s> : Episode <e>"``.

    Every entry is written to ``final_results`` as a UTF-8 encoded,
    tab-separated line and stored in ``videoDict`` (value UTF-8 encoded).

    Returns ``None``. If the season-list request never yields parseable JSON
    the function returns without recording anything; a season whose episode
    request never yields parseable JSON is skipped.
    """
    if authURL == "":
        showHTM = _showRequest("GET", "/Kids/title/" + showLink, kidHeaderDict)
        authURL = showHTM[(showHTM.find('"authURL":"') + 11):].split('"')[0]

    jsonPut = (
        '{"paths":['
        + '["videos",' + showLink + ',"seasonList",{"from":0,"to":' + str(maxSeasons) + '},"summary"],'
        + '["videos",' + showLink + ',"seasonList","summary"],'
        + '["videos",' + showLink + ',"seasonList","current","episodes",{"from":-1,"to":' + str(maxEpisodes) + '},["summary","synopsis","title","runtime","bookmarkPosition"]],'
        + '["videos",' + showLink + ',"seasonList","current","episodes",{"from":-1,"to":' + str(maxEpisodes) + '},"interestingMoment","_342x192","jpg"],'
        + '["videos",' + showLink + ',"seasonList","current","episodes","summary"],'
        + '["videos",' + showLink + ',"seasonList","current","episodes","current","summary"]'
        + '],"authURL":"' + authURL + '"}'
    )
    jsonData = _showPathEvaluator(jsonPut)
    if jsonData is None:
        return

    seasonList = []
    seasonEntries = jsonData['value']['videos'][showLink]["seasonList"]
    for x in range(0, maxSeasons + 1):
        entry = seasonEntries[str(x)]
        # Only two-element entries are used; their second element is the
        # season identifier.
        if len(entry) == 2:
            seasonList.append(entry[1])

    if len(seasonList) == 0:
        # No seasons: the show itself is the playable item.
        watchURL = "http://www.netflix.com/watch/" + showLink
        final_results.write((showName + "\t" + watchURL + '\n').encode('utf8'))
        videoDict[showName] = watchURL.encode('utf8')
        return

    for seasonLink in seasonList:
        jsonPut = (
            '{"paths":['
            + '["seasons",' + seasonLink + ',"episodes",{"from":-1,"to":' + str(maxEpisodes) + '},["summary","synopsis","title","runtime","bookmarkPosition"]],'
            + '["seasons",' + seasonLink + ',"episodes",{"from":-1,"to":' + str(maxEpisodes) + '},"interestingMoment","_342x192","jpg"],'
            + '["seasons",' + seasonLink + ',"episodes","summary"],'
            + '["seasons",' + seasonLink + ',"episodes","current","summary"]'
            + '],"authURL":"' + authURL + '"}'
        )
        jsonData = _showPathEvaluator(jsonPut)
        if jsonData is None:
            # Bounded retries exhausted; skip this season instead of spinning
            # forever on a reply that never parses.
            continue

        episodeList = []
        episodeEntries = jsonData['value']['seasons'][seasonLink]["episodes"]
        for x in range(-1, maxEpisodes + 1):
            entry = episodeEntries[str(x)]
            if len(entry) == 2:
                episodeList.append(entry[1])

        for episodeLink in episodeList:
            summary = jsonData['value']['videos'][episodeLink]["summary"]
            seasonNum = str(summary["season"])
            episodeNum = str(summary["episode"])
            title = showName + "/Season " + seasonNum + " : Episode " + episodeNum
            watchURL = "http://www.netflix.com/watch/" + episodeLink
            final_results.write((title + "\t" + watchURL + '\n').encode('utf8'))
            videoDict[title] = watchURL.encode('utf8')