def save_xhtml(self, xhtml_d):
    """Write one page's source to '<save_path><netloc>.<file_counter>.html'.

    Reads xhtml_d['netloc'] to build the file name and xhtml_d['xhtml_s'] as
    the text to write. self.file_counter is incremented on every call, even
    when the save fails.

    Returns True on success and None when the file could not be created or
    written (the error is printed). When the file cannot be created,
    self.kill_evt is set before None is returned.

    Raises AssertionError when self.save_path is not an existing directory.
    """
    assert os.path.isdir(self.save_path)
    ret_signal = True
    self.file_counter += 1
    # Stays None until the descriptor has been wrapped, so the cleanup below
    # never touches an unbound name when the file could not be created.
    out_file = None
    try:
        try:
            path = self.save_path + str(xhtml_d['netloc']) + "." + str(self.file_counter) + ".html"
            # O_TRUNC: an already existing, longer file must not leave stale
            # bytes after the freshly written source.
            fd = os.open(
                path,
                os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
                stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH,
            )
        except Exception as e:
            # Stop the process first, then re-raise with some extra info
            self.kill_evt.set()
            raise Exception("SCSpider error while Creating file - Error: %s" % e)
        # Line-buffered text wrapper around the descriptor; the encoding is
        # explicit so the output is UTF-8 whatever the locale is.
        out_file = os.fdopen(fd, "w", 1, encoding="utf-8")
    except Exception as e:
        print("SCSpider error while Creating file - Error: %s" % e)
        # None tells the Spider that something went wrong
        ret_signal = None
    else:
        try:
            out_file.write(xhtml_d['xhtml_s'])
        except Exception as e:
            print("SCSpider Error while Writing file - Error: %s" % e)
            ret_signal = None
    finally:
        if out_file is not None:
            out_file.close()
    return ret_signal
def __load_dict(self, filename=None):
    """Load the lines of 'self.filespath + filename' into a dictionary.

    Each line of the file, stripped of trailing whitespace, becomes a key
    mapped to True. No line is treated specially, so a header line, if the
    file has one, is stored as a key as well.

    Returns the dictionary, or None when the file cannot be opened or read
    (the error is printed). A missing filename is reported the same way.
    """
    seen_dict = dict()
    # Stays None until the descriptor has been wrapped, so the cleanup below
    # never touches an unbound name when the file could not be opened.
    reader = None
    try:
        try:
            fd = os.open(self.filespath + filename, os.O_RDONLY)
        except Exception as e:
            print("DUE Unit: Error while Opening file - Error: %s" % e)
            # Nothing to read: return None instead of a dictionary
            return None
        # Text wrapper that decodes the file as UTF-8 (a bytes recoder such as
        # codecs.EncodedFile cannot sit on top of a text-mode file).
        reader = os.fdopen(fd, "r", 1, encoding="utf-8")
        for fileline in reader:
            # Remove trailing whitespace before using the line as a key
            seen_dict[fileline.rstrip()] = True
    except Exception as e:
        print("DUE Unit: Exception occurred while loading file - Error: %s" % e)
        # Notify the Spider that something went wrong
        seen_dict = None
    finally:
        # Close the file in any case
        if reader is not None:
            reader.close()
    return seen_dict
def savetofile(self, filename=None, file_headers=True):
    """Store the whole seen-URL dictionary on disk and empty it.

    Per the original notes, this is meant to be called externally, by a
    process that monitors the DUE Unit, when the crawler runs short of main
    memory; the number of dictionary records is the suggested criterion.

    filename     -- name of the file inside self.filespath; when empty it
                    defaults to '<netloc>.<len(self.filelist)>.seenurls'.
    file_headers -- when true, a 'BASE URL: <netloc>/' line is written first.

    The keys of self.seen are written one per line as UTF-8 text. On success
    the file name is appended to self.filelist, self.seen is cleared and True
    is returned. None is returned when the file cannot be created (the error
    is printed); self.seen is left untouched in that case. Errors raised
    while writing are not caught here and propagate to the caller.
    """
    if not filename:
        filename = str(self.base_url['netloc']) + "." + str(len(self.filelist)) + ".seenurls"
    # Stays None until the descriptor has been wrapped, so the cleanup below
    # never touches an unbound name when the file could not be created.
    writer = None
    try:
        try:
            # O_TRUNC: URLs left over in an older, longer file with the same
            # name must not survive after the newly written ones.
            fd = os.open(
                self.filespath + filename,
                os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
                stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH,
            )
        except Exception as e:
            print("DUE Unit: Error while Creating file - Error: %s" % e)
            # None tells the Spider that something went wrong
            return None
        # Line-buffered text wrapper that encodes the output as UTF-8 (a bytes
        # recoder such as codecs.EncodedFile rejects the str lines written below).
        writer = os.fdopen(fd, "w", 1, encoding="utf-8")
    except Exception as e:
        print("DUE Unit: Error while Saving file - Error: %s" % e)
        ret_signal = None
    else:
        if file_headers:
            writer.write("BASE URL: " + str(self.base_url['netloc']) + "/\n")
        # Snapshot the keys first so the dictionary is not iterated while
        # the lines are being written.
        for url in list(self.seen.keys()):
            writer.write(str(url) + "\n")
        # Remember the new file and release the memory held by the dictionary
        self.filelist.append(str(filename))
        self.seen.clear()
        # True tells the Spider that everything went OK
        ret_signal = True
    finally:
        if writer is not None:
            writer.close()
    return ret_signal