def doDownloadJob(jsonToDict, itemname): global nrOfDownloads try: base_url = jsonToDict['base_url']#becomes folderName for fname_key in jsonToDict.keys(): #At first level, there are two sorts of keys in json-file: #1. base-url #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes) ##As the file content may change over time, every sha(filename)-element contains ##1. sha(s) of a content(s) ###Every sha of a content contains ###1. metadata of a file/content if (fname_key != 'base_url'):#fname_key(sha of file url) becomes local filename #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename) #loop over every sha(content) of a sha(filename) #here, csha is the sha(filecontent) for csha in jsonToDict[fname_key].keys(): contentKeyExists=False """check if metadata contains key 'Content-Type'""" try: if ('Content-Type' in jsonToDict[fname_key][csha]): contentKeyExists=True except: contentKeyExists=False pass #Get the time the json-file was made timeDir = jsonToDict[fname_key][csha]['timeDir'] #download only changes that are no older than #the date of start of current process! process_start_date = comm.makeDateObj(ajadir) json_model_date = comm.makeDateObj(timeDir) #continueonly if #date in model is younger or equal to a #date of a process start if(contentKeyExists) & (json_model_date >= process_start_date): #excel type is already downloaded if("excel" not in jsonToDict[fname_key][csha]['Content-Type']): #full URL of a file file_url = jsonToDict[fname_key][csha]['file_url'] dirPath = comm.downloadsDir + base_url + "/" try: #create folder for this 'date/base_url' if does not exist if (not os.path.isdir(dirPath)) & (not os.path.exists(dirPath)): os.makedirs(dirPath) try: #download the file into that folder #fname_key is the sha(filename) #resulting path of a file will become 'date/base_url/sha(filename)' urr(file_url, dirPath + fname_key) nrOfDownloads += 1 except: comm.printException(comm.pathToSaveDownloadErrors, itemname) pass except: comm.printException(comm.pathToSaveDownloadErrors, itemname) pass except: comm.printException(comm.pathToSaveDownloadErrors, itemname) pass
def doDownloadJob(jsonToDict, itemname): global nrOfDownloads try: base_url = jsonToDict['base_url'] #becomes folderName for fname_key in jsonToDict.keys(): #At first level, there are two sorts of keys in json-file: #1. base-url #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes) ##As the file content may change over time, every sha(filename)-element contains ##1. sha(s) of a content(s) ###Every sha of a content contains ###1. metadata of a file/content if (fname_key != 'base_url' ): #fname_key(sha of file url) becomes local filename #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename) #loop over every sha(content) of a sha(filename) #here, csha is the sha(filecontent) for csha in jsonToDict[fname_key].keys(): contentKeyExists = False """check if metadata contains key 'Content-Type'""" try: if ('Content-Type' in jsonToDict[fname_key][csha]): contentKeyExists = True except: contentKeyExists = False pass #Get the time the json-file was made timeDir = jsonToDict[fname_key][csha]['timeDir'] #download only changes that are no older than #the date of start of current process! process_start_date = comm.makeDateObj(ajadir) json_model_date = comm.makeDateObj(timeDir) #continueonly if #date in model is younger or equal to a #date of a process start if (contentKeyExists) & (json_model_date >= process_start_date): #excel type is already downloaded if ("excel" not in jsonToDict[fname_key][csha] ['Content-Type']): #full URL of a file file_url = jsonToDict[fname_key][csha]['file_url'] dirPath = comm.downloadsDir + base_url + "/" try: #create folder for this 'date/base_url' if does not exist if (not os.path.isdir(dirPath)) & ( not os.path.exists(dirPath)): os.makedirs(dirPath) try: #download the file into that folder #fname_key is the sha(filename) #resulting path of a file will become 'date/base_url/sha(filename)' urr(file_url, dirPath + fname_key) nrOfDownloads += 1 except: comm.printException( comm.pathToSaveDownloadErrors, itemname) pass except: comm.printException( comm.pathToSaveDownloadErrors, itemname) pass except: comm.printException(comm.pathToSaveDownloadErrors, itemname) pass
def dowloadFromJsons(ajadir): nrOfDownloads = 0 jsons = comm.jsonsDir_local """loop over every file in jsons-folder""" for filePath in listdir(jsons): #open json file: #'jsons' is a folder where json-files are saved #'filePath' is a filename in this folder ##'jsons'-folder lives in the folder "datadownload" ##'downloaded_files'-folder lives also in the folder "datadownload" try: """load json-file into directory-type""" jsonToDict = json.load(open(jsons + filePath)) except: continue #'base_url' is the hostname, before "/"-slashes, in json-file #'base_url' is the json-file name ('filePath'), followed by an extension '.json' #'base_url' is also a directory name in 'downloaded_files'-folder base_url = jsonToDict['base_url'] #becomes folderName for fname_key in jsonToDict.keys(): #At first level, there are two sorts of keys in json-file: #1. base-url #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes) ##As the file content may change over time, every sha(filename)-element contains ##1. sha(s) of a content(s) ###Every sha of a content contains ###1. metadata of a file/content if (fname_key != 'base_url' ): #fname_key(sha of file url) becomes local filename #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename) #loop over every sha(content) of a sha(filename) #here, csha is the sha(filecontent) for csha in jsonToDict[fname_key].keys(): contentKeyExists = False contentType = "" """check if metadata contains key 'Content-Type'""" try: if ('Content-Type' in jsonToDict[fname_key][csha]): contentKeyExists = True contentType = jsonToDict[fname_key][csha][ 'Content-Type'] except: contentKeyExists = False pass #download only fresh changes! process_start_date = comm.makeDateObj(ajadir) json_model_date = comm.makeDateObj( jsonToDict[fname_key][csha]['timeDir']) if (contentKeyExists) & (json_model_date >= process_start_date): #excel and pdf types are already downloaded if (("excel" not in contentType) & ("pdf" not in contentType)): """Full URL of a file""" file_url = jsonToDict[fname_key][csha]['file_url'] """Get the time the json-file was made""" #timeDir = jsonToDict[fname_key][csha]['timeDir'] """'dirPath' is the path of a folder of a file currently wants to be downloaded""" dirPath = comm.downloadsDir + base_url + "/" try: """create folder for this 'date/base_url' if does not exist""" if (not os.path.isdir(dirPath)) & ( not os.path.exists(dirPath)): os.makedirs(dirPath) try: #download the file into that folder #fname_key is the sha(filename) #resulting path of a file will become 'date/base_url/sha(filename)' urr(file_url, dirPath + fname_key) nrOfDownloads += 1 #print(timeDir, base_url, , file_url) except: comm.printException( comm.updateErrorsFilePath, filePath) pass except: comm.printException(comm.updateErrorsFilePath, filePath) pass return nrOfDownloads
def dowloadFromJsons(ajadir): nrOfDownloads = 0 jsons=comm.jsonsDir_local """loop over every file in jsons-folder""" for filePath in listdir(jsons): #open json file: #'jsons' is a folder where json-files are saved #'filePath' is a filename in this folder ##'jsons'-folder lives in the folder "datadownload" ##'downloaded_files'-folder lives also in the folder "datadownload" try: """load json-file into directory-type""" jsonToDict = json.load(open(jsons+filePath)); except: continue #'base_url' is the hostname, before "/"-slashes, in json-file #'base_url' is the json-file name ('filePath'), followed by an extension '.json' #'base_url' is also a directory name in 'downloaded_files'-folder base_url = jsonToDict['base_url']#becomes folderName for fname_key in jsonToDict.keys(): #At first level, there are two sorts of keys in json-file: #1. base-url #2. sha(s) of filename(s), (sha of file full url, including "/"-slashes) ##As the file content may change over time, every sha(filename)-element contains ##1. sha(s) of a content(s) ###Every sha of a content contains ###1. metadata of a file/content if (fname_key != 'base_url'):#fname_key(sha of file url) becomes local filename #'jsonToDict[fname_key].keys()' is a list of sha(content) of a current sha(filename) #loop over every sha(content) of a sha(filename) #here, csha is the sha(filecontent) for csha in jsonToDict[fname_key].keys(): contentKeyExists=False contentType="" """check if metadata contains key 'Content-Type'""" try: if ('Content-Type' in jsonToDict[fname_key][csha]): contentKeyExists=True contentType = jsonToDict[fname_key][csha]['Content-Type'] except: contentKeyExists=False pass #download only fresh changes! process_start_date = comm.makeDateObj(ajadir) json_model_date = comm.makeDateObj(jsonToDict[fname_key][csha]['timeDir']) if(contentKeyExists) & (json_model_date >= process_start_date): #excel and pdf types are already downloaded if(("excel" not in contentType) & ("pdf" not in contentType)): """Full URL of a file""" file_url = jsonToDict[fname_key][csha]['file_url'] """Get the time the json-file was made""" #timeDir = jsonToDict[fname_key][csha]['timeDir'] """'dirPath' is the path of a folder of a file currently wants to be downloaded""" dirPath = comm.downloadsDir + base_url + "/" try: """create folder for this 'date/base_url' if does not exist""" if (not os.path.isdir(dirPath)) & (not os.path.exists(dirPath)): os.makedirs(dirPath) try: #download the file into that folder #fname_key is the sha(filename) #resulting path of a file will become 'date/base_url/sha(filename)' urr(file_url, dirPath + fname_key) nrOfDownloads += 1 #print(timeDir, base_url, , file_url) except: comm.printException(comm.updateErrorsFilePath, filePath) pass except: comm.printException(comm.updateErrorsFilePath, filePath) pass return nrOfDownloads