# NB: assumes the enclosing module already imports os, re and urlparse,
# provides DateManip/CalcWeek/get_searches, and binds Image to PIL's Image
# module (or None when PIL is unavailable).
def update(self, directory, strips=None, user=None, now=DateManip(), all_users=False):
    self.now = now
    self.directory = os.path.join(directory, self.now.strftime("%Y-%m-%d" + os.sep))
    self.errors = []
    c = CalcWeek(self.now)
    print self.now.strftime("%Y/%m/%d")
    if not os.path.exists(directory):
        os.makedirs(directory)
    htmlout = open(os.path.join(directory, "index.html"), 'w')
    newindex = self.now.strftime("%Y-%m-%d.html")

    # Patch the most recent existing page so its "FWD LINK" placeholder
    # becomes a "Next day" link pointing at today's page.
    files = os.listdir(directory)
    files.sort()
    files.reverse()
    prev = ""
    for l in files:
        if l == "index.html" or l == newindex:
            continue
        p = os.path.join(directory, l)
        if os.path.isfile(p):
            prev = l
            addfwd = open(p, 'r')
            content = addfwd.read()
            content = content.replace("<!-- FWD LINK --><br />", "<a href=\"" + newindex + "\">Next day</a><br />")
            content = content.replace("<!-- FWD LINK -->", "<a href=\"" + newindex + "\">Next day</a><br />")
            addfwd.close()
            addfwd = open(p, 'w+')
            addfwd.write(content)
            addfwd.close()
            break

    htmlout.write(self.now.strftime("<title>Comics page for %d/%m/%Y</title>\n"))
    htmlout.write(self.now.strftime("<h3>Comics page for %d/%m/%Y</h3>\n"))
    if prev != "":
        htmlout.write("<a href=\"" + prev + "\">Previous day</a>")
    htmlout.write(" <!-- FWD LINK -->")
    if prev != "":
        htmlout.write("<br />\n")

    # Find the newest day folder that is older than the retention window;
    # anything before it is a candidate for cleanup.
    dirs = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x))]
    dirs.sort()
    last = self.now.mod_days(-self.maxdays)
    l = last.strftime("%Y-%m-%d")
    lastdir = None
    for d in dirs:
        if d < l:
            lastdir = d
        else:
            break
    dirs.reverse()
    print "dirs", dirs, lastdir

    for (g, search) in self.get_strips(strips, user, all_users=all_users):
        print "Running", g.name, "(" + g.days + ")"
        # Walk back from today to the strip's last publication day, looking
        # for an already-downloaded copy.
        last = DateManip(c.get_last_day(g.days))
        curr = self.now.copy()
        found = []
        oldstuff = False
        while curr >= last:
            folder = os.path.join(directory, curr.strftime("%Y-%m-%d"))
            if os.path.exists(folder):
                files = os.listdir(folder)
                print "Checking", folder
                for f in files:
                    if f[:len(g.name)] == g.name:
                        found.append(os.path.join(curr.strftime("%Y-%m-%d"), f))
                        if self.now != curr:
                            oldstuff = True
                if len(found) != 0:
                    break
            curr = curr.mod_days(-1)

        # Delete copies older than the retention window, keeping the newest
        # one we find.
        if lastdir is not None:
            found_last = False
            print "cleanup", dirs
            for d in dirs:
                folder = os.path.join(directory, d)
                files = os.listdir(folder)
                for f in files:
                    if f[:len(g.name)] == g.name:
                        if found_last and d < lastdir:
                            print "removing", os.path.join(folder, f)
                            os.unlink(os.path.join(folder, f))
                        if not found_last:
                            print "found", folder
                            found_last = True

        # Nothing usable on disk: run each configured search until one
        # yields images.
        if len([x for x in found if not x.endswith("-error")]) == 0:
            get = []
            tried = 0
            for s in get_searches(g, search):
                (kind, data) = s.retr(now)
                print "kind", kind
                if kind == "generate":
                    # The image URL can be generated directly from the date.
                    print "Getting (image)", data
                    self.cache.set_varying(data, ref=g.homepage)
                    get = [self.get_url(g.name, data, ref=g.homepage)]
                else:  # kind == "search"
                    if self.debug >= 4:
                        print "data", data
                    searchpage = data["searchpage"]
                    assert searchpage != ""
                    print "Getting (searchpage)", searchpage
                    page = self.get_url(g.name, searchpage, ref=searchpage)
                    if page is not None:  # and page.status != urlcache.URLCache.STAT_UNCHANGED:
                        content = page.content
                        if data["initialpattern"] != "":
                            # Narrow the page down before applying the main pattern.
                            print "Initially searching for", data["initialpattern"]
                            iretr = re.findall("(?i)" + data["initialpattern"], content)
                            assert len(iretr) == 1  # other patterns not supported yet
                            content = iretr[0]
                        print "Searching for", data["searchpattern"]
                        assert data["searchpattern"] != ""
                        retr = re.findall("(?i)" + data["searchpattern"], content)
                        if self.debug >= 4:
                            print page.content
                        # Remove duplicate images/paths, preserving order.
                        dups = set()
                        keep = []
                        for item in retr:
                            if item not in dups:
                                dups.add(item)
                                keep.append(item)
                        retr = keep
                        for x in range(len(retr)):
                            if not s.look.HasField("index") or s.look.index == x + 1:
                                r = retr[x]
                                print "Getting (image from search)", urlparse.urljoin(data["baseurl"], r)
                                get.append(self.get_url(g.name, urlparse.urljoin(data["baseurl"], r), ref=searchpage))
                        tried += 1
                    else:
                        print "Got no page at all!"
                get = [x for x in get if x is not None]
                if get != []:
                    break

            if get != []:
                # Compare against the newest previous day's copies; identical
                # sizes mean we just refetched old strips.
                old = []
                files = os.listdir(directory)
                files.sort()
                files.reverse()
                nowfolder = self.now.strftime("%Y-%m-%d")
                for l in files:
                    if l == nowfolder:
                        continue
                    folder = os.path.join(directory, l)
                    if os.path.isdir(folder):
                        files = os.listdir(folder)
                        print "Looking for old in", folder
                        for f in files:
                            if f[:len(g.name)] == g.name:
                                old.append(os.path.join(folder, f))
                        if old != []:
                            break
                if old != [] and len(old) == len(get):
                    for o in range(len(old)):
                        print "Comparing", old[o], "and", get[o].url
                        if len(get[o].content) != os.stat(old[o]).st_size:
                            break
                    else:
                        self.store_err(g.name, 1, "Got the old stuff in " + folder)
                        print ""
                        continue

                # Write each fetched image as <name>-<index>.<ext> in today's folder.
                index = 0
                folder = os.path.join(directory, self.now.strftime("%Y-%m-%d" + os.sep))
                for u in get:
                    ext = self.makeext(u, g)
                    index += 1
                    if not os.path.exists(folder):
                        os.mkdir(folder)
                    fname = folder + g.name + "-" + str(index) + "." + ext
                    outfile = open(fname, 'wb')
                    try:
                        outfile.write(u.content)
                        outfile.close()
                    except Exception:
                        os.unlink(fname)
                        raise
                    found.append(self.now.strftime("%Y-%m-%d" + os.sep + g.name + "-" + str(index) + "." + ext))
            elif tried > 0:  # get == [] despite trying
                self.store_err(g.name, 2, "Failed to get anything for <a href=\"" + g.homepage + "\">" + g.homepage + "</a>")

        if len(found) > 0:
            print "We found", found
            if not oldstuff:
                # Emit this strip's section: header plus <img> tags, sized
                # via PIL when available.
                htmlout.write("<h3><a href=\"" + g.homepage + "\">" + g.desc + "</a></h3>\n")
                onlyerror = len([x for x in found if not x.endswith("error")]) == 0
                for f in found:
                    if f.endswith("error"):
                        if onlyerror:
                            htmlout.write(open(os.path.join(directory, f)).read())
                        continue
                    if Image:
                        try:
                            dimensions = [x * g.zoom for x in Image.open(os.path.join(directory, f)).size]
                            htmlout.write("<img src=\"%s\" width=\"%d\" height=\"%d\"/><br />\n" % (f.replace(os.sep, "/"), dimensions[0], dimensions[1]))
                            continue
                        except:
                            pass  # assume something PIL can't cope with
                    htmlout.write("<img src=\"%s\" /><br />\n" % f.replace(os.sep, "/"))
                htmlout.write("<br />\n")
            else:
                self.store_err(g.name, 1, "Got the old stuff in " + folder)
        print ""

    # Append any errors noisy enough for the current debug level.
    self.errors.sort()
    for (l, e) in self.errors:
        if l >= self.debug:
            htmlout.write(str(l) + ": " + e + "<br />\n")
    htmlout.close()

    # Prune empty day folders and their dated pages.
    for d in dirs:
        di = os.path.join(directory, d)
        if os.listdir(di) == []:
            os.removedirs(di)
            if os.path.exists(os.path.join(directory, d + ".html")):
                os.unlink(os.path.join(directory, d + ".html"))

    # Keep a dated copy of today's index so later pages can link back to it.
    htmlout = open(os.path.join(directory, "index.html"), 'r')
    dated = open(os.path.join(directory, newindex), 'w')
    dated.write(htmlout.read())
    dated.close()
    htmlout.close()
    self.cache.cleanup()
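# Usage sketch for update() above. The Grabber class name and constructor are
# assumptions for illustration; only update()'s own signature comes from this
# code.
#
#   grabber = Grabber()
#   grabber.update("/var/www/comics", now=DateManip())
#
# After a run, index.html (plus a dated YYYY-MM-DD.html copy) sits in the
# target directory, with the day's images under a YYYY-MM-DD/ subfolder.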
# A second variant of update(): here strip metadata lives in a g.entries dict,
# DateManip dates are compared via compare() and mutated in place, and file
# extensions are derived from the response's MIME type.
def update(self, directory, strips=None, group=None, now=DateManip()):
    self.now = now
    self.errors = []
    c = CalcWeek(self.now)
    print self.now.strftime("%Y/%m/%d")
    if not os.path.exists(directory):
        os.makedirs(directory)
    htmlout = open(os.path.join(directory, "index.html"), 'w')
    newindex = self.now.strftime("%Y-%m-%d.html")

    # Patch the most recent existing page so its "FWD LINK" placeholder
    # becomes a "Next day" link pointing at today's page.
    files = os.listdir(directory)
    files.sort()
    files.reverse()
    prev = ""
    for l in files:
        if l == "index.html" or l == newindex:
            continue
        p = os.path.join(directory, l)
        if os.path.isfile(p):
            prev = l
            addfwd = open(p, 'r')
            content = addfwd.read()
            content = content.replace("<!-- FWD LINK --><br />", "<a href=\"" + newindex + "\">Next day</a><br />")
            content = content.replace("<!-- FWD LINK -->", "<a href=\"" + newindex + "\">Next day</a><br />")
            addfwd.close()
            addfwd = open(p, 'w+')
            addfwd.write(content)
            addfwd.close()
            break

    htmlout.write(self.now.strftime("<title>Comics page for %d/%m/%Y</title>\n"))
    htmlout.write(self.now.strftime("<h3>Comics page for %d/%m/%Y</h3>\n"))
    if prev != "":
        htmlout.write("<a href=\"" + prev + "\">Previous day</a>")
    htmlout.write(" <!-- FWD LINK -->")
    if prev != "":
        htmlout.write("<br />\n")

    # Find the newest day folder that is older than the retention window.
    dirs = [x for x in os.listdir(directory) if os.path.isdir(os.path.join(directory, x))]
    dirs.sort()
    last = self.now.copy()
    last.mod_days(-self.maxdays)
    l = last.strftime("%Y-%m-%d")
    lastdir = None
    for d in dirs:
        if d < l:
            lastdir = d
        else:
            break
    dirs.reverse()
    print "dirs", dirs, lastdir

    for g in self.get_strips(strips, group):
        print "Running", g.entries["name"], "(" + g.entries["days"] + ")"
        # Walk back from today to the strip's last publication day, looking
        # for an already-downloaded copy.
        last = DateManip(c.get_last_day(g.entries["days"]))
        curr = self.now.copy()
        found = []
        oldstuff = False
        while curr.compare(last) != -1:
            folder = os.path.join(directory, curr.strftime("%Y-%m-%d"))
            if os.path.exists(folder):
                files = os.listdir(folder)
                print "Checking", folder
                for f in files:
                    if f[:len(g.entries["strip"])] == g.entries["strip"]:
                        found.append(os.path.join(curr.strftime("%Y-%m-%d"), f))
                        if self.now.compare(curr) != 0:
                            oldstuff = True
                if len(found) != 0:
                    break
            else:
                print "no such folder", folder
            curr.mod_days(-1)

        # Delete copies older than the retention window, keeping the newest
        # one we find.
        if lastdir is not None:
            found_last = False
            print "cleanup", dirs
            for d in dirs:
                folder = os.path.join(directory, d)
                files = os.listdir(folder)
                for f in files:
                    if f[:len(g.entries["strip"])] == g.entries["strip"]:
                        if found_last and d < lastdir:
                            print "removing", os.path.join(folder, f)
                            os.unlink(os.path.join(folder, f))
                        if not found_last:
                            print "found", folder
                            found_last = True

        if len(found) == 0:
            # Nothing on disk: run each configured search until one yields images.
            get = []
            tried = 0
            for s in g.get_searches(self.now):
                (kind, data) = s.retr(now)
                if kind == "generate":
                    # The image URL can be generated directly from the date.
                    print "Getting (image)", data
                    self.cache.set_varying(data, ref=g.entries["homepage"])
                    get = [self.get_url(g.entries["strip"], data, ref=g.entries["homepage"])]
                else:  # kind == "search"
                    if self.debug >= 4:
                        print "data", data
                    (pattern, baseurl, searchpage) = data
                    print "Getting (searchpage)", searchpage
                    page = self.get_url(g.entries["strip"], searchpage, ref=searchpage)
                    if page is not None:  # and page.status != urlcache.URLCache.STAT_UNCHANGED:
                        print "Searching for", pattern
                        retr = re.findall("(?i)" + pattern, page.content)
                        if self.debug >= 4:
                            print page.content
                        # Remove duplicate images/paths, preserving order.
                        dups = set()
                        keep = []
                        for item in retr:
                            if item not in dups:
                                dups.add(item)
                                keep.append(item)
                        retr = keep
                        for x in range(len(retr)):
                            if not s.look.has_key("index") or int(s.look["index"]) == x + 1:
                                r = retr[x]
                                print "Getting (image from search)", urlparse.urljoin(baseurl, r)
                                get.append(self.get_url(g.entries["strip"], urlparse.urljoin(baseurl, r), ref=searchpage))
                        tried += 1
                    else:
                        print "Got no page at all!"
                get = [x for x in get if x is not None]
                if get != []:
                    break

            if get != []:
                # Compare against the newest previous day's copies; identical
                # sizes mean we just refetched old strips.
                old = []
                files = os.listdir(directory)
                files.sort()
                files.reverse()
                nowfolder = self.now.strftime("%Y-%m-%d")
                for l in files:
                    if l == nowfolder:
                        continue
                    folder = os.path.join(directory, l)
                    if os.path.isdir(folder):
                        files = os.listdir(folder)
                        print "Looking for old in", folder
                        for f in files:
                            if f[:len(g.entries["strip"])] == g.entries["strip"]:
                                old.append(os.path.join(folder, f))
                        if old != []:
                            break
                if old != [] and len(old) == len(get):
                    for o in range(len(old)):
                        print "Comparing", old[o], "and", get[o].url
                        if len(get[o].content) != os.stat(old[o]).st_size:
                            break
                    else:
                        self.store_err(g.entries["strip"], 1, "Got the old stuff in " + folder)
                        print ""
                        continue

                index = 0
                folder = os.path.join(directory, self.now.strftime("%Y-%m-%d" + os.sep))
                for u in get:
                    # Pick a file extension: an explicit "ext" entry wins,
                    # otherwise derive one from the response's MIME type.
                    if g.entries.has_key("ext"):
                        ext = g.entries["ext"]
                    else:
                        if u.mime[0] != "image" and u.mime[0] != "application":
                            self.store_err(g.entries["strip"], 2, "Getting for <a href=\"" + g.entries["homepage"] + "\">" + g.entries["homepage"] + "</a> found us a %s/%s (non-image) while retrieving %s" % (u.mime[0], u.mime[1], u.url))
                            continue
                        if u.mime[1] == 'jpeg':
                            ext = 'jpg'
                        elif u.mime[1] == 'gif':
                            ext = 'gif'
                        elif u.mime[1] == 'png':
                            ext = 'png'
                        elif u.mime[1] == 'octet-stream':  # somewhat lame
                            ext = 'gif'
                        else:
                            raise Exception("Don't know extension " + str(u.mime))
                    index += 1
                    if not os.path.exists(folder):
                        os.mkdir(folder)
                    fname = folder + g.entries["strip"] + "-" + str(index) + "." + ext
                    outfile = open(fname, 'wb')
                    outfile.write(u.content)
                    outfile.close()
                    found.append(self.now.strftime("%Y-%m-%d" + os.sep + g.entries["strip"] + "-" + str(index) + "." + ext))
            elif tried > 0:  # get == [] despite trying
                self.store_err(g.entries["strip"], 2, "Failed to get anything for <a href=\"" + g.entries["homepage"] + "\">" + g.entries["homepage"] + "</a>")

        if len(found) > 0:
            print "We found", found
            if not oldstuff:
                htmlout.write("<h3><a href=\"" + g.entries["homepage"] + "\">" + g.entries["name"] + "</a></h3>\n")
                for f in found:
                    htmlout.write("<img src=\"" + f.replace(os.sep, "/") + "\" /><br />\n")
                htmlout.write("<br />\n")
            else:
                self.store_err(g.entries["strip"], 1, "Got the old stuff in " + folder)
        print ""

    # Append any errors noisy enough for the current debug level.
    self.errors.sort()
    for (l, e) in self.errors:
        if l >= self.debug:
            htmlout.write(str(l) + ": " + e + "<br />\n")
    htmlout.close()

    # Prune empty day folders and their dated pages.
    for d in dirs:
        di = os.path.join(directory, d)
        if os.listdir(di) == []:
            os.removedirs(di)
            if os.path.exists(os.path.join(directory, d + ".html")):
                os.unlink(os.path.join(directory, d + ".html"))

    # Keep a dated copy of today's index so later pages can link back to it.
    htmlout = open(os.path.join(directory, "index.html"), 'r')
    dated = open(os.path.join(directory, newindex), 'w')
    dated.write(htmlout.read())
    dated.close()
    htmlout.close()
    self.cache.cleanup()