def parseDailyMovie(dl):
    """Fetch the daily page at ``dl.link``, split it into per-movie chunks
    and store each previously-unseen chunk as a MovieLink row.

    dl -- a DailyLink object providing ``.link``, ``.parsed`` and ``.save()``.
    Raises URLError (from urllib2.urlopen) when the page cannot be fetched.
    Side effects: creates MovieLink rows and marks ``dl`` as parsed.
    """
    page = urllib2.urlopen(dl.link)
    try:
        # The site serves gb18030; the rest of the pipeline works in utf-8.
        content = page.read().decode('gb18030').encode('utf8')
    finally:
        # Always release the connection — the original closed the page only
        # on the success path and leaked the socket if read()/decode raised.
        page.close()

    # Keep only the inner HTML of <div id="content"> ... </div>.
    # NOTE(review): if the marker is missing, find() returns -1 and the
    # slice below is garbage — TODO add a guard once confirmed it can happen.
    marker = '<div id="content">'
    start = content.find(marker) + len(marker)
    end = content.find('</div>', start)
    content = content[start:end]

    # Movie entries are separated by a blank line (two consecutive <br />).
    movies = content.split("<br />\r\n<br />\r\n")

    # Non-greedy patterns with capture groups: the original greedy '.*'
    # merged several images/links on one physical line into a single match,
    # so every URL after the first on that line was lost.
    img_re = re.compile(r'<IMG class="postimg" src="(.*?)" />', re.IGNORECASE)
    link_re = re.compile(
        r'<A href="(.*?)" target=_blank >\*\*\*\*\*點此下載\*\*\*\*\*</A>',
        re.IGNORECASE)

    for movie in movies:
        movie = movie.strip()
        if len(movie) < 20:
            # Too short to be a real entry — separator noise.
            continue

        # Digest of the raw chunk is the dedup key.
        digestkey = hashlib.sha224(movie).hexdigest()
        # The title is everything before the first line break.
        mTitle = movie[:movie.find("<br />")]

        imagesLink = ";".join(m.group(1) for m in img_re.finditer(movie))
        dlLinks = ";".join(m.group(1) for m in link_re.finditer(movie))

        # .exists() avoids materializing the whole queryset just to count it.
        if MovieLink.objects.filter(digestkey=digestkey).exists():
            logger.info("movie already existed:...." + mTitle)
        else:
            MovieLink(title=mTitle, raw_desc=movie, digestkey=digestkey,
                      daily_link=dl, images=imagesLink,
                      downloadlink=dlLinks).save()

    dl.parsed = True
    dl.save()
def parserDaily():
    """Parse every DailyLink not yet marked as parsed and create MovieLink
    rows for movies that are not already in the database.

    Side effects: may populate ``daily.raw_desc`` from the network, creates
    MovieLink rows, and marks processed DailyLink rows as parsed.
    """
    for daily in DailyLink.objects.filter(parsed=False):
        # Reuse the cached page body when we already fetched it once.
        if daily.raw_desc:
            html = daily.raw_desc
        else:
            html = getHTML(daily.link)
            daily.raw_desc = html

        if html:
            parser = DailyCollectionParser()
            parser.feed(html)
            for movie in parser.all_movies:
                title = ''
                desc = ''
                if movie.desc:
                    # The first description line doubles as the title.
                    title = movie.desc[0]
                    desc = '\r\n'.join(movie.desc).strip()

                images = ';'.join(movie.imgs)
                downloadlink = ';'.join(movie.links)
                # Dedup key is the digest of the joined download links.
                # NOTE(review): movies with no links all hash the empty
                # string, so only the first link-less movie is ever stored —
                # confirm that is acceptable before changing the key, since
                # existing DB digests depend on this scheme.
                digestkey = hashlib.sha256(downloadlink).hexdigest()

                # .exists() is cheaper than len(queryset) for a pure
                # presence check.
                if not MovieLink.objects.filter(digestkey=digestkey).exists():
                    MovieLink(title=title, raw_desc=desc,
                              digestkey=digestkey, daily_link=daily,
                              images=images,
                              downloadlink=downloadlink).save()

            # Mark parsed only when we actually had HTML to process, so
            # dailies whose fetch failed are retried on the next run.
            daily.parsed = True
            daily.save()