def get_douban_movies(parser, mlist):
    # Search Douban for each movie by its English and then its Chinese
    # title, keep plausible hits, and pick the best match per movie.
    lastres = []
    if len(mlist) == 0:
        return []
    handler = site_handler.get_site_handler("www.douban.com", parser)
    for m in mlist:
        res = []
        try:
            url = "http://movie.douban.com/subject_search?search_text="
            if len(m.ename) > 0:
                lurl = url + m.ename
                print "get douban", lurl
                page = utils.crawl_timeout(lurl, 15, 3)
                if page == None:
                    print "ERROR,timeout,douban,url=", lurl
                    continue
                reslist = handler.dir_parse(url, page)
                for ti in reslist:
                    if is_maybe_ok(ti, m):
                        ti.search_key = m.ename
                        res.append(ti)
                time.sleep(1)
            # Repeat the search with the Chinese title.
            lurl = url + m.cname
            print "get douban", lurl
            page = utils.crawl_timeout(lurl, 15, 3)
            if page == None:
                print "ERROR,timeout,douban,url=", lurl
                continue
            reslist = handler.dir_parse(url, page)
            for ti in reslist:
                if is_maybe_ok(ti, m):
                    ti.search_key = m.cname
                    res.append(ti)
            time.sleep(1)
            # Pick the single best candidate and extract the Douban id
            # from its subject URL.
            ti = get_most_like(m, res)
            if ti != None:
                print "get it", ti.url, ti.raw, ti.pic_url
                ti.mid = ti.url.split('subject/')[1].split('/')[0]
                m.mid = ti.mid
                lastres.append(ti)
        except Exception, e:
            traceback.print_exc(file=sys.stdout)
            print "ERROR:", e
            print "ERROR:get douban search error", m.raw
            print "ERROR:get douban ", m.ename, "//", m.cname
            continue
    return lastres
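# Illustrative sketch, not the repo's code: is_maybe_ok and
# get_most_like are called above but defined elsewhere. A plausible
# minimal contract, guessed from their call sites: a hit is "maybe ok"
# when one of the record's known titles appears in the hit's raw text,
# and the "most like" hit is the plausible one whose year is closest
# to the record's year. The _sketch suffix marks these as hypothetical
# stand-ins; field names (raw, year, ename, cname) follow their usage
# in this file.
def _is_maybe_ok_sketch(ti, m):
    raw = (ti.raw or "").lower()
    return (len(m.ename) > 0 and m.ename.lower() in raw) or \
           (len(m.cname) > 0 and m.cname in raw)

def _get_most_like_sketch(m, candidates):
    best, best_gap = None, 999
    for ti in candidates:
        try:
            gap = abs(int(ti.year) - int(m.year))
        except (TypeError, ValueError):
            continue
        if gap < best_gap:
            best, best_gap = ti, gap
    return best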
def get_imdb_movies(parser, mlist):
    # Search IMDb by English title and accept the first hit whose year
    # is within one year of the expected release year.
    res = []
    if len(mlist) == 0:
        return []
    handler = site_handler.get_site_handler("www.imdb.com", parser)
    for m in mlist:
        if has_chinese(m.ename):
            continue
        try:
            url = "http://www.imdb.com/find?q="
            if len(m.ename) > 0:
                key = m.ename.replace('.', ' ')
                lurl = url + key
                print lurl
                page = utils.crawl_timeout(lurl, 15, 3)
                if page == None:
                    print "ERROR,timeout,imdb,url=", lurl
                    continue
                dlist = handler.dir_parse(url, page)
                for ti in dlist:
                    print "get imdb dir_parse", ti.raw, ti.url, ti.year
                    # The IMDb id is the "tt..." segment of the title URL.
                    imdbid = ti.url.split("title/")[1].split("/")[0]
                    if "tt" in imdbid:
                        ti.imdbid = imdbid.replace('tt', '')
                    if abs(int(ti.year) - int(m.year)) <= 1:
                        m.imdbid = ti.imdbid
                        res.append(ti)
                        print "add imdb result", ti.url
                        break
                time.sleep(1)
        except Exception, e:
            traceback.print_exc()
            print "ERROR:", e
            print "ERROR:get imdb search error", m.raw
            print "ERROR:get imdb ", m.ename, "//", m.cname
            continue
    return res
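# Illustrative sketch, not the repo's code: has_chinese is used above
# to skip non-Latin keys before querying IMDb, but is defined
# elsewhere. A minimal version can test for codepoints in the CJK
# Unified Ideographs range; this assumes utf-8 byte strings or unicode
# input.
def _has_chinese_sketch(s):
    if isinstance(s, str):
        s = s.decode('utf-8', 'ignore')
    for ch in s:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False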
def get_douban_movies2(parser, mlist):
    # Variant of get_douban_movies that takes a dict mapping a search
    # key to the list of movie records sharing that key, so each key
    # hits Douban only once.
    if len(mlist) == 0:
        return []
    handler = site_handler.get_site_handler("www.douban.com", parser)
    res = []
    for k, v in mlist.items():
        try:
            url = "http://movie.douban.com/subject_search?search_text="
            if len(k) > 0:
                key = k.replace('.', ' ')
                lurl = url + key
                print "get douban", lurl
                page = utils.crawl_timeout(lurl, 15, 3)
                if page == None:
                    print "ERROR,timeout,douban,url=", lurl
                    continue
                reslist = handler.dir_parse(url, page)
                maybelist = []
                for ti in reslist:
                    for m in v:
                        if is_maybe_ok(ti, m):
                            ti.search_key = k
                            maybelist.append(ti)
                            break  # avoid appending the same hit once per record
                ti = get_most_like(v, maybelist)
                if ti:
                    res.append(ti)
                time.sleep(1)
        except Exception, e:
            traceback.print_exc(file=sys.stdout)
            print "ERROR:", e
            print "ERROR:get douban search error", k
            continue
    return res
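# Illustrative sketch, not the repo's code: get_douban_movies2 and
# get_imdb_movies2 expect a dict mapping one search key to every
# record sharing it, so duplicate titles trigger only one request
# each. A plausible way to build that dict (the helper name and the
# choice of ename as the grouping key are assumptions):
def _group_by_ename_sketch(mlist):
    grouped = {}
    for m in mlist:
        grouped.setdefault(m.ename, []).append(m)
    return grouped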
def get_imdb_movies2(parser, mmap):
    # Variant of get_imdb_movies keyed by a dict of search key ->
    # list of movie records; accepts a hit whose year is within one
    # year of any record sharing the key.
    res = []
    if len(mmap) == 0:
        return []
    handler = site_handler.get_site_handler("www.imdb.com", parser)
    for k, v in mmap.items():
        if has_chinese(k):
            continue
        try:
            url = "http://www.imdb.com/find?q="
            if len(k) > 0:
                key = k.replace('.', ' ')
                lurl = url + key
                print lurl
                page = utils.crawl_timeout(lurl, 15, 3)
                if page == None:
                    print "ERROR,timeout,imdb,url=", lurl
                    continue
                dlist = handler.dir_parse(url, page)
                for ti in dlist:
                    for m in v:
                        if abs(int(ti.year) - int(m.year)) <= 1:
                            ti.search_key = k
                            res.append(ti)
                            break  # avoid appending the same hit once per record
                time.sleep(1)
        except Exception, e:
            traceback.print_exc()
            print "ERROR:", e
            print "ERROR:get imdb search error", k
            continue
    return res
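# Illustrative sketch, not the repo's code: utils.crawl_timeout(url,
# timeout, retries) is used throughout this file but defined in utils.
# Its call sites imply it returns the page body on success and None
# once every attempt has failed; a minimal urllib2 version under that
# assumption:
import urllib2

def _crawl_timeout_sketch(url, timeout, retries):
    for _ in range(retries):
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except Exception:
            time.sleep(1)  # brief pause before the next attempt
    return None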
        continue
    elif "imdb" in url:
        # Remember unseen IMDb links for later; they are not crawled
        # in this pass.
        if url not in imdburlmap:
            imdburlmap[url] = 1
        continue
    try:
        if "http://" in url:
            print "craw link", url
            page = utils.crawl_timeout(url, 15, 3)
            if page == None:
                print "ERROR:", url
            else:
                handler = get_site_handler(url, parser)
                it, urls = handler.parse(url, page)
                if it != None:
                    # Fill in fields from the tab-separated input row
                    # when the parsed page left them empty.
                    if it.cname == None or it.cname == "":
                        it.cname = urllist[2]
                    it.ename = urllist[3]
                    it.date = urllist[4]
                    it.mid = urllist[5]
                    it.imdbid = urllist[6]
                    if it.quality == "":
                        it.quality = urllist[7]
                    it.raw = urllist[1]
                    detaillist.append(it)
                for ur in urls:
                    u = ur.strip('/')
                    if "douban" in u:
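# Illustrative sketch, not the repo's code: the main block below keys
# stored Link rows by utils.get_md5_value(url). Assuming that helper
# simply returns the hex MD5 digest of the URL bytes, a minimal
# version is:
import hashlib

def _get_md5_value_sketch(s):
    return hashlib.md5(s).hexdigest()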
if __name__ == "__main__":
    try:
        mmap = {}
        urlfile = sys.argv[2]
        output_url = sys.argv[3]
        parser = parse.Parser()
        parser.init(sys.argv[1])
        mlist = []
        # Each input line is "<url>\t<quality>"; crawl every listing
        # page and collect the movie entries it contains.
        for line in open(urlfile, 'r'):
            flist = line.strip().split('\t')
            url = flist[0]
            quality = flist[1]
            print url
            handler = site_handler.get_site_handler(url, parser)
            page = utils.crawl_timeout(url, 15, 3)
            if page != None:
                cclist = handler.dir_parse(url, page)
                for m in cclist:
                    m.quality = quality
                mlist.extend(cclist)
        for m in mlist:
            print m.url
            print m.raw
            print m.cname
            print m.ename
        # Look up links already stored, keyed by the MD5 of the URL,
        # so known entries are not re-inserted.
        havelist = Link.objects.filter(
            urlmd5__in=[utils.get_md5_value(it.url) for it in mlist])
        linkmap = {i.url: i for i in havelist}
        linklist = []