Example #1
import hashlib
import json

import requests

import comm          # project-local helpers (error logging, paths)
import postToWorker  # project-local helpers (posting URL batches to workers)


def detectChanges(jsonFilePath, listOfUrls):
    # nrOfChanges, worker_counter, ipList and ipList_tuple are
    # module-level globals.
    global nrOfChanges
    global worker_counter
    jsonDict = dict()
    # JSON files may be malformed; in that case they cannot be
    # loaded into a dictionary.
    isValidJsonFile = True
    try:
        # Open the JSON file and load it into a dictionary.
        with open(jsonFilePath) as jsonFile:
            jsonDict = json.load(jsonFile)
    except (OSError, ValueError):
        isValidJsonFile = False
    # Get the URLs from the current dict.
    if isValidJsonFile:
        # All hashes are SHA-224.
        for key in jsonDict.keys():
            # Any key other than 'base_url' is the SHA of a file's URL
            # (saved elsewhere into the variable localFilename). Under that
            # key there are, among other metadata, one or more content SHAs
            # and the URL of that file (document) on the web.
            if key != 'base_url':
                # Structure of the JSON file:
                #   sha of the file's URL
                #     sha of the file's content
                #       metadata of the file (filename, sha(file content),
                #       human-readable file URL under key 'file_url',
                #       accessed date)
                #     sha of another (updated) content of the same file
                #       metadata...
                # SHAs of the file's content at each time it was accessed:
                fileSHAs = list(jsonDict[key].keys())
                arbitrFileSha = fileSHAs[0]  # only needed to look up the file URL
                fileUrl = jsonDict[key][arbitrFileSha]["file_url"]
                redirectedTo = None
                try:
                    # Follow redirects to the document's current URL.
                    redirectedTo = requests.get(fileUrl).url
                except requests.RequestException:
                    comm.printException(comm.updateErrorsFilePath, errString="open_url")
                    continue  # continue with the next URL in the loop
                if redirectedTo is not None:
                    # Read the document's content at the current moment.
                    try:
                        pageread = requests.get(redirectedTo).text
                    except Exception:
                        comm.printException(comm.updateErrorsFilePath, errString="pageread1")
                        try:
                            # Fall back to decoding the raw bytes, replacing
                            # undecodable characters, so pageread stays a str.
                            pageread = requests.get(redirectedTo).content.decode(
                                'utf-8', errors='replace').strip()
                        except Exception as e:
                            comm.printException(comm.updateErrorsFilePath, errString="pageread2")
                            print(e)
                            continue
                    # Hash the current content of this document.
                    fileContentSha224 = hashlib.sha224(pageread.encode('utf-8')).hexdigest()
                    # Check whether the content has changed in the meantime.
                    if fileContentSha224 not in fileSHAs:  # data has changed
                        # Count the change.
                        nrOfChanges += 1
                        # The document's content has changed, so queue its URL
                        # for a worker that extracts entities.
                        listOfUrls.append(fileUrl)
                        postListSize = postToWorker.defPostListSize(worker_counter, ipList_tuple)
                        # Send a fixed number of URLs to each worker, then empty the list.
                        if len(listOfUrls) == postListSize:
                            # Send the list of URLs to a worker.
                            worker_counter = postToWorker.detectConnection(
                                ipList, worker_counter, listOfUrls)
                            # Empty the list to prepare it for the next worker.
                            del listOfUrls[:]
                            # Advance to the next worker.
                            worker_counter += 1
                            if worker_counter > (len(ipList) - 1):  # past the last worker
                                # Start over from the first worker in the list.
                                worker_counter = 0
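
detectChanges depends on several module-level names that the snippet above does not define. A minimal, hypothetical driver could look like the sketch below; the worker addresses and the jsonFilesDir argument are illustrative assumptions, not values from the Mailis/EstNer project.

# Hypothetical module-level setup and driver for detectChanges.
import os

nrOfChanges = 0
worker_counter = 0
ipList = ["10.0.0.1", "10.0.0.2"]  # assumed worker addresses
ipList_tuple = tuple(ipList)

def scanAllJsonFiles(jsonFilesDir):
    # Walk a directory of per-site JSON metadata files and check each one.
    listOfUrls = []
    for name in os.listdir(jsonFilesDir):
        if name.endswith(".json"):
            detectChanges(os.path.join(jsonFilesDir, name), listOfUrls)
    # listOfUrls may still hold a final, partially filled batch here;
    # the caller decides whether to flush it to a worker.
    return listOfUrls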
Example #2
File: auth.py Project: Mailis/EstNer
# Fragment: mylargefile, processline, nr_of_log_rows, line_counter,
# worker_counter, ipList and ipList_tuple come from the surrounding scope;
# time, comm and postToWorker are imported at module level.
urlsList = []
# Avoid double work: post only distinct URLs.
distinct_urls = set()
with open(mylargefile) as f:
    for line in f:
        nr_of_log_rows += 1  # for statistics
        plineUrl = processline(line)
        if plineUrl is not None and plineUrl != "":
            if (plineUrl not in distinct_urls
                    and 'icomoon' not in plineUrl.lower()
                    and 'hobekivi' not in plineUrl.lower()):
                distinct_urls.add(plineUrl)
                line_counter += 1
                urlsList.append(plineUrl)
                postListSize = postToWorker.defPostListSize(worker_counter, ipList_tuple)
                # Send a fixed number of URLs to each worker, then empty the list.
                if len(urlsList) == postListSize:
                    # Post the list, retrying until a connection to a worker succeeds.
                    worker_counter = postToWorker.detectConnection(
                        ipList, worker_counter, urlsList)
                    # Record the post in the postreq log file.
                    with open(comm.postreq_path, 'a') as jf:
                        jf.write(time.strftime("%d/%m/%Y_%H:%M:%S") +
                                 " just posted to: " +
                                 str(ipList[worker_counter]) + "\n")
                    del urlsList[:]  # empty the list of URLs
                    # Prepare the next worker.
                    worker_counter += 1
                    if worker_counter > (len(ipList) - 1):
                        # Start over from the first worker in the list.
                        worker_counter = 0
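
The batching-and-rotation pattern is the same in both examples and can be isolated from the log-parsing context. Below is a self-contained sketch under stated assumptions: postWorkerStub stands in for postToWorker.detectConnection, and all names and URLs are illustrative.

# Round-robin batch dispatch, reduced to its core.
def postWorkerStub(ipList, worker_counter, batch):
    # Stand-in for postToWorker.detectConnection; the real function
    # returns the index of the worker it actually connected to.
    print("posting", len(batch), "URLs to", ipList[worker_counter])
    return worker_counter

def dispatch(urls, ipList, batch_size=3):
    batch, worker_counter = [], 0
    for url in urls:
        batch.append(url)
        if len(batch) == batch_size:
            worker_counter = postWorkerStub(ipList, worker_counter, batch)
            del batch[:]  # empty the batch for the next worker
            worker_counter += 1
            if worker_counter > len(ipList) - 1:
                worker_counter = 0  # wrap around to the first worker

dispatch(["http://example.com/%d" % i for i in range(7)],
         ["10.0.0.1", "10.0.0.2"])

As in the originals, a final batch smaller than batch_size is left unposted; the caller is responsible for flushing it.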