import datetime
import hashlib
import json
import os
import time

import requests

# comm and postToWorker are project-local helper modules (error logging and
# worker dispatch, respectively).
import comm
import postToWorker


def detectChanges(jsonFilePath, listOfUrls):
    global nrOfChanges
    global worker_counter
    jsonDict = dict()
    # JSON files may be malformed; in that case they cannot be loaded into a
    # dictionary.
    isValidJsonFile = True
    try:
        with open(jsonFilePath) as jsonFile:
            jsonDict = json.load(jsonFile)
    except Exception:
        isValidJsonFile = False
    if isValidJsonFile:
        # All hashes are SHA-224.
        for key in jsonDict.keys():
            # Every key other than 'base_url' is the SHA of a file's URL
            # (elsewhere saved into the variable localFilename). Under that
            # key there are, among other metadata, one or more content SHAs.
            # Structure:
            #   sha of file URL
            #     sha of file content
            #       file metadata (filename, sha(file content), human-readable
            #       file URL under key 'file_url', accessed date)
            #     sha of another (updated) file content
            #       metadata ...
            if key != 'base_url':
                # SHAs of the file's content at each time it was accessed:
                fileSHAs = list(jsonDict[key].keys())
                # Any one content SHA is enough to look up the file URL:
                arbitrFileSha = fileSHAs[0]
                fileUrl = jsonDict[key][arbitrFileSha]["file_url"]
                redirectedTo = None
                try:
                    # Follow redirects to find where the file lives now.
                    redirectedTo = requests.get(fileUrl).url
                except Exception:
                    comm.printException(comm.updateErrorsFilePath,
                                        errString="open_url")
                    continue  # continue with the next URL in the loop
                if redirectedTo is not None:
                    # Read the document's content at the current moment.
                    try:
                        pageread = requests.get(redirectedTo).text
                    except Exception:
                        comm.printException(comm.updateErrorsFilePath,
                                            errString="pageread1")
                        try:
                            # Fall back to decoding the raw bytes explicitly.
                            pageread = (requests.get(redirectedTo).content
                                        .decode('utf-8', errors='replace')
                                        .strip())
                        except Exception as e:
                            comm.printException(comm.updateErrorsFilePath,
                                                errString="pageread2")
                            print(e)
                            continue
                    # Hash the document's current content.
                    fileContentSha224 = hashlib.sha224(
                        pageread.encode('utf-8')).hexdigest()
                    # Check whether the content has changed in the meantime.
                    if fileContentSha224 not in fileSHAs:
                        # The data has changed: count the change and queue the
                        # URL so a worker can re-extract entities from it.
                        nrOfChanges += 1
                        listOfUrls.append(fileUrl)
                        postListSize = postToWorker.defPostListSize(
                            worker_counter, ipList_tuple)
                        # Send a fixed-size batch of URLs to each worker, then
                        # empty the list for the next worker.
                        if len(listOfUrls) == postListSize:
                            worker_counter = postToWorker.detectConnection(
                                ipList, worker_counter, listOfUrls)
                            del listOfUrls[:]
                            # Advance to the next worker, wrapping around to
                            # the first after the last one in the list.
                            worker_counter += 1
                            if worker_counter > (len(ipList) - 1):
                                worker_counter = 0
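# For reference, a minimal sketch of the JSON layout detectChanges expects,
# built from the structure comments above. The path, URL, date key, and
# (truncated) hash values are illustrative assumptions; only 'base_url',
# 'file_url', and the sha(URL) -> sha(content) -> metadata nesting come from
# the code itself.
exampleDict = {
    "base_url": "http://example.org/",
    "0f1e...": {                    # sha224 of the file's URL (truncated)
        "a3b9...": {                # sha224 of the file's content when accessed
            "file_url": "http://example.org/docs/report.pdf",
            "accessed": "31/01/2015"
        }
    }
}
# Dumping exampleDict to a file and passing its path to
# detectChanges(path, []) exercises the change-detection loop above.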
        if jsonpath != "":
            try:
                detectChanges(os.path.join(dirpath, fname), listOfUrls)
            except Exception:
                comm.printException(comm.updateErrorsFilePath,
                                    errString="start_detect")
    # Post the remainder of the list to a worker.
    if len(listOfUrls) > 0:
        # If the last worker in the list was just used, wrap around to the
        # first one; otherwise move on to the next.
        if worker_counter == (len(ipList) - 1):
            worker_counter = 0
        else:
            worker_counter += 1
        # Send the list of URLs to the chosen worker.
        worker_counter = postToWorker.detectConnection(ipList, worker_counter,
                                                       listOfUrls)
else:
    comm.printException(comm.updateErrorsFilePath,
                        errString="no_json-files_for_updating")

# Save the finishing time of the update process, to measure how long it took.
end = datetime.datetime.now()
span = end - start
try:
    with open(comm.monthly_updates_path, 'a') as jf:
        jf.write("update-process " + currTime + " " + str(span) + " " +
                 str(nrOfChanges) + " " + str(lenIpList) + " ")
except Exception:
    comm.printException(comm.updateErrorsFilePath, errString="update")
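# The round-robin dispatch above assumes this contract for postToWorker:
# defPostListSize(worker_counter, ipList_tuple) returns the batch size for
# the current worker, and detectConnection(ipList, worker_counter, urls)
# retries workers until one accepts the POST and returns the index of the
# worker that actually received the batch. A minimal sketch of that assumed
# behaviour (not the project's actual module):
def detectConnection_sketch(ipList, worker_counter, urls):
    # Try each worker, starting at worker_counter, until one responds.
    for offset in range(len(ipList)):
        idx = (worker_counter + offset) % len(ipList)
        try:
            requests.post("http://" + ipList[idx], json=urls, timeout=10)
            return idx
        except Exception:
            continue
    return worker_counter  # no worker reachable; caller keeps its counter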
distinct_urls = set()
with open(mylargefile) as f:
    for line in f:
        nr_of_log_rows += 1  # for statistics
        plineUrl = processline(line)
        if plineUrl is not None and plineUrl != "":
            if (plineUrl not in distinct_urls
                    and 'icomoon' not in plineUrl.lower()
                    and 'hobekivi' not in plineUrl.lower()):
                distinct_urls.add(plineUrl)
                line_counter += 1
                urlsList.append(plineUrl)
                postListSize = postToWorker.defPostListSize(
                    worker_counter, ipList_tuple)
                # Send a fixed-size batch of URLs to each worker, then empty
                # the list of URLs.
                if len(urlsList) == postListSize:
                    # Keep posting until a connection to a worker succeeds.
                    worker_counter = postToWorker.detectConnection(
                        ipList, worker_counter, urlsList)
                    with open(comm.postreq_path, 'a') as jf:
                        jf.write(time.strftime("%d/%m/%Y_%H:%M:%S") +
                                 " just posted to: " +
                                 str(ipList[worker_counter]) + "\n")
                    del urlsList[:]  # empty the list of URLs
                    # Prepare the next worker, starting over from the first
                    # worker in the list after the last one.
                    worker_counter += 1
                    if worker_counter > (len(ipList) - 1):
                        worker_counter = 0
        # Don't let memory usage grow too much.
        if len(distinct_urls) > 1000:
            # The body of this branch is truncated in the source; clearing
            # the set matches the comment's intent (an assumption).
            distinct_urls.clear()
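# processline is referenced but not defined in this section. A minimal
# sketch, assuming the input is an Apache-style access log where the
# requested URL is the second token of the quoted request field (purely
# illustrative; the project's parser may differ):
def processline_sketch(line):
    parts = line.split('"')
    if len(parts) > 1:
        request = parts[1].split(' ')  # e.g. 'GET /path HTTP/1.1'
        if len(request) > 1:
            return request[1]
    return None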