def scrapeSubGenre(cookies, callstackpath, maxrequestsperminute, metapath, url):
    """Scrape one genre page and cache its sub-genre list as JSON.

    Fetches *url* (a Netflix ``WiGenre?agid=...`` page), records the Shakti
    API endpoint found in the page source to ``<metapath>/apiurl``, then
    extracts every sub-genre link and writes the ``{name: id}`` mapping to
    ``<metapath>/genres/<parent-genre-id>.json``.

    cookies, callstackpath and maxrequestsperminute are passed through to
    utils.makeGetRequest; metapath is the metadata cache directory; url must
    contain an ``agid=<id>`` query parameter identifying the parent genre.
    Returns None; all results are written to disk.
    """
    response = utils.makeGetRequest(url, cookies, callstackpath, maxrequestsperminute)

    # Record the Shakti API root + build identifier so other code can call
    # the JSON API directly.  If several matches occur, the last one wins.
    apimatch = re.compile('\"BUILD_IDENTIFIER\":\"(.*?)\".*?\"SHAKTI_API_ROOT\":\"(.*?)\"', re.DOTALL).findall(response)
    apiurl = ""
    for build, root in apimatch:
        apiurl = root + "/" + build
    if apiurl != "":
        with open(os.path.join(metapath, "apiurl"), 'w') as fh:
            fh.write(apiurl)

    # BUGFIX: the cache file must be named after the PARENT genre (the agid
    # in the requested url).  The original code reused the sub-genre loop
    # variable here, naming the file after the last sub-genre on the page,
    # so the freshness check in scrapeGenres never found it.
    parentmatch = re.compile("agid=([^&]+)").findall(url)
    parentid = parentmatch[0] if parentmatch else None

    # Narrow to the sub-genre section when present so links elsewhere on the
    # page are not picked up by the regex below.
    if '<div id="subGenres"' in response:
        response = response[response.index('<div id="subGenres"'):]
    matches = re.compile("<a.*?WiGenre\\?agid=(.*?)\\&.*?\">.*?<span>(.*?)</span>.*?</a>", re.DOTALL).findall(response)

    data = collections.OrderedDict()
    for genreid, genrename in matches:
        data[utils.cleanstring(genrename)] = genreid

    if len(data) > 0 and parentid is not None:
        with open(os.path.join(metapath, "genres", parentid + ".json"), 'w') as fh:
            fh.write(json.dumps(data))
def scrapeGenres(cookies, callstackpath, maxrequestsperminute, metapath, cacheage):
    """Scrape the Netflix home page for top-level genres and cache them.

    Writes a ``{name: id}`` mapping of all genres to
    ``<metapath>/genres/genres.json`` and triggers scrapeSubGenre for every
    genre whose per-genre cache file is missing or older than *cacheage*
    days.  A marker file ``<metapath>/active/scrape_genres`` flags the scrape
    as in progress and is removed when finished.

    cookies, callstackpath and maxrequestsperminute are passed through to
    utils.makeGetRequest; cacheage is the cache lifetime in days (anything
    int()-convertible).  Returns None; all results are written to disk.
    """
    # Crude in-progress flag so other components can see a scrape is running.
    activedir = os.path.join(metapath, "active")
    if not os.path.isdir(activedir):
        os.mkdir(activedir)
    marker = os.path.join(activedir, "scrape_genres")
    if not os.path.exists(marker):
        with open(marker, 'w') as fh:
            fh.write("currently scraping Genres")

    response = utils.makeGetRequest('http://www.netflix.com', cookies, callstackpath, maxrequestsperminute)
    matches = re.compile("<li><a href=\"(.*?)WiGenre\\?agid=(.*?)\">(.*?)</a></li>", re.DOTALL).findall(response)

    genrefile = os.path.join(metapath, "genres", "genres.json")
    data = collections.OrderedDict()
    oneday = 24 * 60 * 60  # hoisted: loop-invariant
    for url, genreid, genrename in matches:
        print("Netflix: DEBUG: " + url)
        # Rebuild a canonical genre URL from the id rather than trusting the
        # scraped href.
        url = "http://www.netflix.com/WiGenre?agid=" + genreid
        data[utils.cleanstring(genrename)] = genreid

        # BUGFIX: the original checked "Genres" (capital G) while
        # scrapeSubGenre writes into "genres"; on case-sensitive filesystems
        # the cache was therefore never found and every genre was re-scraped
        # on every run.
        subgenrefile = os.path.join(metapath, "genres", genreid + ".json")
        updatesubgenres = False
        if os.path.exists(subgenrefile):
            # Re-scrape when the cached file is older than cacheage days.
            if utils.fileIsOlderThan(subgenrefile, oneday * int(cacheage)):
                updatesubgenres = True
        else:
            updatesubgenres = True
        if updatesubgenres:
            scrapeSubGenre(cookies, callstackpath, maxrequestsperminute, metapath, url)

    if len(data) > 0:
        with open(genrefile, 'w') as fh:
            fh.write(json.dumps(data))

    os.remove(marker)