def init():
    status, data, update_date = cron.check()
    if status:  # downloaded index changed
        dic = parseIndex(data)  # parse the index HTML into a dict
        if dic is None:
            sys.exit(0)
        updateFiles(dic)
        stats = {}
        stats["lastUpdated"] = update_date.strip()
        stats["files"] = {}
        for key in dic:
            if key in config.getConfig("tracking_files"):
                stats["files"][key] = dic[key].strip()
        try:
            del stats["files"]["../"]  # remove the parent-directory index entry
        except KeyError:
            pass
        try:
            with open("stats.json", "w") as f:
                f.write(json.dumps(stats, indent=4, sort_keys=True))
            log.put("Stats updated", "SUCCESS")
        except Exception:
            log.put("Cannot update stats", "FAIL")
        log.headPut("Finished cron-dbpedia", "SUCCESS")
    else:  # no change, exit silently
        sys.exit(0)
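# For illustration only (made-up file names and values): with two tracked
# files, the stats.json written above ends up shaped like this, one entry
# per tracked file mapping its name to the text captured from the index:
#
# {
#     "files": {
#         "instance_types_en.nt.bz2": "03-May-2015 12:00",
#         "labels_en.nt.bz2": "03-May-2015 12:00"
#     },
#     "lastUpdated": "Sun, 03 May 2015 12:30:00 GMT"
# }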
def loadStats():
    global stats
    try:
        with open("stats.json", "r") as f:
            stats = json.loads(f.read())
        log.put("Read stats", "SUCCESS")
    except Exception as e:
        log.put("Read stats", "FAIL")
        sys.exit(0)
def parseIndex(data):
    log.put("Start parsing index", "INFO")
    soup = bs(data)  # assumed: bs is BeautifulSoup, e.g. from bs4 import BeautifulSoup as bs
    pre = soup.find("pre")
    dic = {}
    if pre is not None:
        # each file in the listing is a link; keep the first tab-separated
        # field of the text that follows the link as that file's value
        anchors = pre.findAll("a")
        for anchor in anchors:
            if anchor is not None:
                dic[anchor.get("href").strip()] = anchor.next_sibling.split("\t")[0].strip()
        return dic
    else:
        log.put("Parsing error no pre tag", "ERROR")
        return None
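# A minimal sketch (made-up entries) of the directory-index HTML that
# parseIndex() expects: a single <pre> block whose links are each followed
# by tab-separated metadata, the first field of which is recorded:
#
# <html><body><pre>
# <a href="../">../</a>	-
# <a href="labels_en.nt.bz2">labels_en.nt.bz2</a>	03-May-2015 12:00	259M
# </pre></body></html>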
# Python 2 networking imports assumed by this helper
from urllib2 import urlopen, HTTPError, URLError
import os

def dlfile(url):
    # Open the url
    try:
        f = urlopen(url)
        print "downloading " + url
        # Open our local file for writing
        with open(os.path.basename(url), "wb") as local_file:
            local_file.write(f.read())
        log.put("Download success " + url, "SUCCESS")
        log.put("Starting decompression", "INFO")
        os.system("bunzip2 -d " + os.path.basename(url))
        log.put("File decompressed " + url, "SUCCESS")
    # handle errors
    except HTTPError, e:
        print "HTTP Error:", e.code, url
        log.put("Download failed " + url, "ERROR")
    except URLError, e:
        print "URL Error:", e.reason, url
        log.put("Download failed " + url, "ERROR")
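# Typical call, e.g. from updateFiles() below (file name is illustrative):
#     dlfile(config.getConfig("base_url") + "labels_en.nt.bz2")
# which leaves the decompressed labels_en.nt in the working directory.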
def downloadPage():
    global stats
    url = config.getConfig("base_url")
    html_data = None
    try:
        response = u.urlopen(url)  # assumed: u aliases urllib2 in this module
        response_headers = response.info().dict
        html_data = response.read()
        log.put("Index page downloaded", "SUCCESS")
        last_update_date = response_headers["date"].strip()
        if stats["lastUpdated"] != last_update_date:
            log.put("New version available", "INFO")
            return True, html_data, last_update_date
        else:
            log.put("New version not available", "INFO")
            return False, None, None
    except Exception as e:
        log.put("Index page failed to download", "FAIL")
        return False, None, None
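# Callers unpack the three-tuple as
#     status, html_data, update_date = downloadPage()
# the same shape init() expects from cron.check(), which presumably wraps
# or aliases this function.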
import json
import log
import sys

c = None
try:
    with open("config.json", "r") as f:
        c = json.loads(f.read())
    log.put("Read config", "SUCCESS")
except Exception as e:
    log.put("Read config", "FAIL")
    sys.exit(0)

def getConfig(key):
    try:
        return c[key]
    except KeyError as e:
        log.put(key + " not present in config", "WARNING")
        return None
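# A minimal config.json this module can load (illustrative values; only
# "base_url" and "tracking_files" are read elsewhere in this project):
#
# {
#     "base_url": "http://downloads.dbpedia.org/current/en/",
#     "tracking_files": ["labels_en.nt.bz2", "instance_types_en.nt.bz2"]
# }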
def updateFiles(dic):
    log.put("Downloading tracked files", "INFO")
    li = config.getConfig("tracking_files")  # list of files to be updated
    for l in li:
        try:
            if cron.stats["files"][l] != dic[l]:  # file changed
                dlfile(config.getConfig("base_url") + l)
                log.put("Parsing " + config.getConfig("base_url") + l, "INFO")
                nt.parseURI(l.replace(".bz2", ""), l.split(".")[0])
                log.put("Parsed " + config.getConfig("base_url") + l, "SUCCESS")
        except KeyError:  # file not in stats yet: download and parse it anyway
            dlfile(config.getConfig("base_url") + l)
            log.put("Parsing " + config.getConfig("base_url") + l, "INFO")
            nt.parseURI(l.replace(".bz2", ""), l.split(".")[0])
            log.put("Parsed " + config.getConfig("base_url") + l, "SUCCESS")
    log.put("Tracked files updated", "SUCCESS")
    log.put("Deleting all files from cache", "INFO")
    os.system("rm *.nt")
    os.system("rm *.ttl")
    log.put("Files deleted from cache", "SUCCESS")
print "downloading " + url # Open our local file for writing with open(os.path.basename(url), "wb") as local_file: local_file.write(f.read()) log.put("Download success "+url,"SUCCESS") log.put("Starting decompressing","INFO") os.system("bunzip2 -d "+os.path.basename(url)) log.put("File decompressed "+url,"SUCCESS") #handle errors except HTTPError, e: print "HTTP Error:", e.code, url log.put("Download failed "+url,"ERROR") except URLError, e: print "URL Error:", e.reason, url log.put("Download failed "+url,"ERROR") def updateFiles(dic): log.put("Downloading tracked files","INFO") li=config.getConfig("tracking_files") #list of files to be updated for l in li: try: if(cron.stats["files"][l]!=dic[l]): #file changed dlfile(config.getConfig("base_url")+l) log.put("Parsing "+config.getConfig("base_url")+l,"INFO") nt.parseURI(l.replace(".bz2",""),l.split(".")[0],l.replace(".bz2",""),l.split(".")[0]) log.put("Parsed "+config.getConfig("base_url")+l,"SUCCESS") except KeyError: dlfile(config.getConfig("base_url")+l) log.put("Parsing "+config.getConfig("base_url")+l,"INFO")