def readExcel(filePath, url, ontologyData):
    """Download an Excel workbook from *url* to *filePath*, then run entity
    extraction over every text cell of every worksheet.

    Parameters:
        filePath     -- local path where the downloaded workbook is saved
        url          -- remote URL of the workbook (also used as error label)
        ontologyData -- passed through unchanged to getEntities.getEntities()

    All errors are logged via comm.printException and swallowed so that one
    bad workbook does not abort a batch run (best-effort contract preserved).
    """
    try:
        # Fetch the remote file to the local path.
        # NOTE(review): presumably urr is a urlretrieve-style helper — confirm.
        urr(url, filePath)
        try:
            workbook = xlrd.open_workbook(filePath)
            for worksheet_name in workbook.sheet_names():
                worksheet = workbook.sheet_by_name(worksheet_name)
                # range() iteration replaces the original manual while-counters;
                # both visit every row 0..nrows-1 and cell 0..ncols-1.
                for curr_row in range(worksheet.nrows):
                    for curr_cell in range(worksheet.ncols):
                        # Cell types: 0=Empty, 1=Text, 2=Number, 3=Date,
                        # 4=Boolean, 5=Error, 6=Blank
                        cell_type = worksheet.cell_type(curr_row, curr_cell)
                        if cell_type == 1:
                            cell_value = worksheet.cell_value(curr_row, curr_cell)
                            # Split the cell text into sentences, extract entities.
                            for sentence in comm.replaceToPunkts(cell_value):
                                getEntities.getEntities(url, sentence, ontologyData)
        except Exception:
            # Was a bare 'except:' — narrowed so SystemExit/KeyboardInterrupt
            # are no longer silently trapped; logging behavior unchanged.
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
# Pull each per-entity-type RDF file from the worker machine to the master,
# then merge everything. Best-effort: a missing remote file is skipped.
for fname in rdfFnames:
    # RDF URL on the worker machine:
    #   /var/www/html/outputf/<ip>/rdf_files/<date_dir>/<fname>.rdf
    #   (fname is one of LOC, ORG, PER)
    inputf = remote_date_dir_path + "/" + fname + ".rdf"
    # Target directory on the master machine:
    #   /var/www/html/rdf_files/<fname>/   e.g. .../rdf_files/LOC/
    outputDir = comm.pathToRDFdir + fname + "/"
    if not os.path.isdir(outputDir):
        # make dirs ORG, PER, LOC in master's dir 'rdf_files'
        os.makedirs(outputDir)
    # RDF file path on the master machine:
    #   /var/www/html/rdf_files/<fname>/<remoteName>_<date_dir>_.rdf
    #   e.g. /var/www/html/rdf_files/ORG/worker1_08_2015_09_10_.rdf
    # old was: output = outputDir + remoteName + ".rdf"
    output = outputDir + remoteName + "_" + date_dir + "_" + ".rdf"
    try:
        urr(inputf, output)
    except Exception:
        # Narrowed from a bare 'except:'; the worker may simply not have
        # produced this entity type, so a failed fetch is deliberately ignored.
        pass
mergeRDFfiles()
def dowloadFromJsons(ajadir):
    """Walk every JSON metadata file in comm.jsonsDir and download each
    non-Excel file whose metadata was written today (timeDir == *ajadir*).

    JSON layout (per file):
        'base_url'                    -- hostname; becomes the download folder name
        <sha(file url)>               -- one key per file
            <sha(file content)>       -- one key per content version
                'Content-Type', 'timeDir', 'file_url', ... -- metadata

    Files are saved as  comm.downloadsDir/<base_url>/<sha(file url)>.

    Returns:
        int -- number of files successfully downloaded.
    """
    nrOfDownloads = 0
    jsons = comm.jsonsDir
    # Loop over every file in the jsons folder ('errors.txt' is bookkeeping).
    for filePath in listdir(jsons):
        if filePath != "errors.txt":
            try:
                # 'with' fixes the original leaked file handle
                # (json.load(open(...)) never closed the file).
                with open(jsons + filePath) as jsonFile:
                    jsonToDict = json.load(jsonFile)
            except Exception:
                # Unreadable/invalid JSON: skip this metadata file entirely.
                continue
            base_url = jsonToDict['base_url']  # becomes folder name
            for fname_key in jsonToDict.keys():
                # Top level has two kinds of keys: 'base_url' and sha(file url).
                if fname_key != 'base_url':
                    # Each sha(file url) maps sha(content) -> metadata dict.
                    for csha in jsonToDict[fname_key].keys():
                        # Check whether the metadata records a Content-Type.
                        try:
                            contentKeyExists = 'Content-Type' in jsonToDict[fname_key][csha]
                        except Exception:
                            contentKeyExists = False
                        # The time the json-file entry was made.
                        timeDir = jsonToDict[fname_key][csha]['timeDir']
                        # Download only today's changes! ('and' replaces the
                        # original bitwise '&'; operands are plain booleans,
                        # so the result is identical but short-circuits.)
                        if contentKeyExists and ajadir == timeDir:
                            # Excel files are handled by a separate path.
                            if "excel" not in jsonToDict[fname_key][csha]['Content-Type']:
                                file_url = jsonToDict[fname_key][csha]['file_url']
                                # Folder for this base_url's downloads.
                                dirPath = comm.downloadsDir + base_url + "/"
                                try:
                                    # 'not exists' alone is equivalent to the
                                    # original 'not isdir and not exists'.
                                    if not os.path.exists(dirPath):
                                        os.makedirs(dirPath)
                                    try:
                                        # Saved path: <downloadsDir>/<base_url>/<sha(file url)>
                                        urr(file_url, dirPath + fname_key)
                                        nrOfDownloads += 1
                                    except Exception:
                                        comm.printException(comm.pathToSaveDownloadErrors, filePath)
                                except Exception:
                                    comm.printException(comm.pathToSaveDownloadErrors, filePath)
    return nrOfDownloads
# Copy one worker RDF file (for the entity type in 'fname') to the master,
# then merge all collected RDF files. Best-effort: fetch failures are skipped.
#
# RDF URL on the worker machine:
#   /var/www/html/outputf/<ip>/rdf_files/<date_dir>/<fname>.rdf
#   (fname is one of LOC, ORG, PER)
inputf = remote_date_dir_path + "/" + fname + ".rdf"
# Target directory on the master machine:
#   /var/www/html/rdf_files/<fname>/   e.g. .../rdf_files/ORG/
outputDir = comm.pathToRDFdir + fname + "/"
if not os.path.isdir(outputDir):
    # make dirs ORG, PER, LOC in master's dir 'rdf_files'
    os.makedirs(outputDir)
# RDF file path on the master machine:
#   /var/www/html/rdf_files/<fname>/<remoteName>_<date_dir>_.rdf
#   e.g. /var/www/html/rdf_files/ORG/worker1_08_2015_09_10_.rdf
# old was: output = outputDir + remoteName + ".rdf"
output = outputDir + remoteName + "_" + date_dir + "_" + ".rdf"
try:
    urr(inputf, output)
except Exception:
    # Narrowed from a bare 'except:'; a missing remote file is deliberately
    # ignored so the transfer remains best-effort.
    pass
mergeRDFfiles()