def get_douban_movies(parser,mlist): #print ename #ename ,year = get_title_year(ename) #print ename #ename = "Le domaine des dieux" lastres=[] if len(mlist) ==0: return [] handler = site_handler.get_site_handler("www.douban.com",parser) for m in mlist: res = [] try: url="http://movie.douban.com/subject_search?search_text=" if len(m.ename)>0: lurl="http://movie.douban.com/subject_search?search_text="+m.ename print "get douban",lurl page = utils.crawl_timeout(lurl,15,3) if page ==None: print "ERROR,timeout,douban,url=",lurl continue reslist = handler.dir_parse(url,page) for ti in reslist: if is_maybe_ok(ti,m): ti.search_key = m.ename print "xxxx" res.append(ti) time.sleep(1) lurl="http://movie.douban.com/subject_search?search_text="+m.cname print "get douban",lurl page = utils.crawl_timeout(lurl,15,3) if page ==None: print "ERROR,timeout,douban,url=",lurl continue reslist = handler.dir_parse(url,page) for ti in reslist: if is_maybe_ok(ti,m): print "zzz" ti.search_key = m.cname res.append(ti) time.sleep(1) ti = get_most_like(m,res) if ti !=None: print "get it",ti.url,ti.raw,ti.pic_url ti.mid = ti.url.split('subject/')[1].split('/')[0] m.mid = ti.mid print "ppppp",m.mid lastres.append(ti) except Exception,e: traceback.print_exc(sys.stdout) print "ERROR:",e print "ERROR:get douban search error",m.raw print "ERROR:get douban ",m.ename,"//",m.cname continue
def get_imdb_movies(parser,mlist): res = [] if len(mlist) ==0: return [] handler = site_handler.get_site_handler("www.imdb.com",parser) for m in mlist: if has_chinese(m.ename): continue try: url="http://www.imdb.com/find?q=" if len(m.ename)>0: key = m.ename.replace('.',' ') lurl="http://www.imdb.com/find?q="+key print lurl page = utils.crawl_timeout(lurl,15,3) if page ==None: print "ERROR,timeout,imdb,url=",lurl continue dlist = handler.dir_parse(url,page) for ti in dlist: print "get imdb dir_parse",ti.raw,ti.url,ti.year imdbid = ti.url.split("title/")[1].split("/")[0] if "tt" in imdbid: ti.imdbid = imdbid.replace('tt','') print "m ename",m.ename print "m year",m.year if abs(int(ti.year) - int(m.year)) <=1: m.imdbid = ti.imdbid res.append(ti) print "print add imdb result",ti.url break time.sleep(1) except Exception,e: traceback.print_exc() print "ERROR:",e print "ERROR:get imdb search error",m.raw print "ERROR:get imdb ",m.ename,"//",m.cname continue
def get_douban_movies2(parser,mlist): #print ename #ename ,year = get_title_year(ename) #print ename #ename = "Le domaine des dieux" if len(mlist) ==0: return [] handler = site_handler.get_site_handler("www.douban.com",parser) res = [] for k,v in mlist.items(): try: url="http://movie.douban.com/subject_search?search_text=" if len(k)>0: key = k.replace('.',' ') lurl="http://movie.douban.com/subject_search?search_text="+key print "get douban",lurl page = utils.crawl_timeout(lurl,15,3) if page ==None: print "ERROR,timeout,douban,url=",lurl continue reslist = handler.dir_parse(url,page) maybelist = [] for ti in reslist: for m in v: if is_maybe_ok(ti,m): ti.search_key = k maybelist.append(ti) ti = get_most_like(v,maybelist) if ti: res.append(ti) time.sleep(1) except Exception,e: traceback.print_exc(sys.stdout) print "ERROR:",e print "ERROR:get douban search error",k continue
def download_pic(mid,pdir):
    # Download the large poster for douban subject `mid` into pdir/<mid>.jpg
    # and build a thumbnail at pdir/s/<mid>.jpg via makepic().
    #
    # NOTE(review): this definition is duplicated verbatim further down in
    # the file; the later definition shadows this one at import time, so
    # this copy is dead code and should be removed.
    url = "http://api.douban.com/v2/movie/subject/"+str(mid)
    p = utils.crawl_timeout(url,15,3)
    # NOTE(review): crawl_timeout returns None on timeout elsewhere in this
    # file; json.loads(None) would raise TypeError here, uncaught.
    page = json.loads(p)
    #print page
    if 'images' in page:
        if 'large' in page['images']:
            pic_url=page['images']['large']
            try:
                path = pdir+"/"+str(mid)+".jpg"
                path2 = pdir+"/s/"+str(mid)+".jpg"
                data = urllib.urlopen(pic_url).read()
                # NOTE(review): handle leaks if write() raises before close()
                f = file(path,"wb")
                f.write(data)
                f.close()
                makepic(path,path2)
            except Exception,e:
                traceback.print_exc(sys.stdout)
                print e
def download_pic(mid, pdir): url = "http://api.douban.com/v2/movie/subject/" + str(mid) p = utils.crawl_timeout(url, 15, 3) page = json.loads(p) #print page if 'images' in page: if 'large' in page['images']: pic_url = page['images']['large'] try: path = pdir + "/" + str(mid) + ".jpg" path2 = pdir + "/s/" + str(mid) + ".jpg" data = urllib.urlopen(pic_url).read() f = file(path, "wb") f.write(data) f.close() makepic(path, path2) except Exception, e: traceback.print_exc(sys.stdout) print e
def get_imdb_movies2(parser,mmap): res = [] if len(mmap) ==0: return [] search_list = [] handler = site_handler.get_site_handler("www.imdb.com",parser) for k,v in mmap.items(): if has_chinese(k): continue try: url="http://www.imdb.com/find?q=" if len(k)>0: key = k.replace('.',' ') lurl="http://www.imdb.com/find?q="+key print lurl page = utils.crawl_timeout(lurl,15,3) if page ==None: print "ERROR,timeout,imdb,url=",lurl continue dlist = handler.dir_parse(url,page) for ti in dlist: for m in v: if abs(int(ti.year) - int(m.year)) <=1: ti.search_key = k res.append(ti) time.sleep(1) except Exception,e: traceback.print_exc() print "ERROR:",e print "ERROR:get imdb search error",k continue
if len(url) < 3: continue if "douban" in url: if url not in doubanurlmap: doubanurlmap[url] = 1 continue elif "imdb" in url: if url not in imdburlmap: #print "get imdb", url imdburlmap[url] = 1 continue try: if "http://" in url: print "craw link", url page = utils.crawl_timeout(url, 15, 3) # print "xxx",lurl if page == None: print "ERROR:", url else: handler = get_site_handler(url, parser) it, urls = handler.parse(url, page) if it != None: if it.cname == None or it.cname == "": it.cname = urllist[2] it.ename = urllist[3] it.date = urllist[4] it.mid = urllist[5] it.imdbid = urllist[6] if it.quality == "": it.quality = urllist[7]
if len(url)<3: continue if "douban" in url: if url not in doubanurlmap: doubanurlmap[url]=1 continue elif "imdb" in url: if url not in imdburlmap: #print "get imdb", url imdburlmap[url]=1 continue try: if "http://" in url: print "craw link",url page=utils.crawl_timeout(url,15,3) # print "xxx",lurl if page ==None: print "ERROR:",url else: handler = get_site_handler(url,parser) it,urls = handler.parse(url,page) if it!=None: if it.cname ==None or it.cname =="": it.cname = urllist[2] it.ename = urllist[3] it.date = urllist[4] it.mid = urllist[5] it.imdbid = urllist[6] if it.quality =="": it.quality = urllist[7]