예제 #1
0
 def save_xhtml(self, xhtml_d):
     """Write one page's source to a new numbered file under ``self.save_path``.

     The file name is ``<netloc>.<file_counter>.html`` appended directly to
     ``self.save_path`` (no separator is inserted, so the path is expected to
     end with one).  ``self.file_counter`` is incremented on every call,
     including calls that fail.

     Args:
         xhtml_d: mapping providing ``'netloc'`` (used in the file name) and
             ``'xhtml_s'`` (the text written to the file).

     Returns:
         True when the source was written and the file closed cleanly;
         None when the file could not be created, wrapped or written.

     Raises:
         AssertionError: if ``self.save_path`` is not an existing directory
             (only while assertions are enabled).
     """
     assert os.path.isdir(self.save_path)
     self.file_counter += 1
     # Read/write permission bits for user, group and others (subject to umask).
     rw_all = (stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
               | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
     try:
         file_path = (self.save_path + str(xhtml_d['netloc']) + "."
                      + str(self.file_counter) + ".html")
         # O_TRUNC: a shorter page must not keep the tail of an older, longer
         # file that happens to have the same name.
         fd = os.open(file_path, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, rw_all)
     except Exception as e:
         # Stop the process first, then report the failure to the caller.
         self.kill_evt.set()
         print("SCSpider error while Creating file - Error: %s" % e)
         return None
     try:
         # Place a line-buffered file-object wrapper around the descriptor.
         fobj = os.fdopen(fd, "w", 1)
     except Exception as e:
         # The wrapper never took ownership, so release the descriptor here;
         # it may already have been closed by the failed wrapper.
         try:
             os.close(fd)
         except OSError:
             pass
         print("SCSpider error while Creating file - Error: %s" % e)
         return None
     try:
         # The with-block closes the file on every path, and a failing
         # flush/close is reported like any other write error.
         with fobj:
             fobj.write(xhtml_d['xhtml_s'])
     except Exception as e:
         print("SCSpider Error while Writing file - Error: %s" % e)
         return None
     return True
예제 #2
0
 def __load_dict(self, filename=None):
     """Load a file of seen URLs into a fresh dictionary.

     The file is read from ``self.filespath + filename``.  Each line, with
     trailing whitespace removed, becomes a key mapped to ``True``.

     NOTE(review): every line becomes a key, so a ``BASE URL: ...`` header
     line (as written by ``savetofile`` with ``file_headers=True``) is loaded
     as a key too -- confirm callers tolerate this.

     Args:
         filename: name of the file, relative to ``self.filespath``.

     Returns:
         The dictionary of seen URLs, or None when the file cannot be opened
         or read (including when ``filename`` is left as None).
     """
     # Create a temp dictionary of the seen URLs in 'file'
     seen_dict = dict()
     try:
         fd = os.open(self.filespath + filename, os.O_RDONLY)
     except Exception as e:
         print("DUE Unit: Error while Opening file - Error: %s" % e)
         # Nothing was opened, so there is nothing to close: report failure.
         return None
     try:
         # Read raw bytes; decoding is done explicitly below.
         fobj = os.fdopen(fd, "rb")
     except Exception as e:
         # The wrapper never took ownership, so release the descriptor here;
         # it may already have been closed by the failed wrapper.
         try:
             os.close(fd)
         except OSError:
             pass
         print("DUE Unit: Exception occurred while loading file - Error: %s" % e)
         return None
     try:
         # The with-block closes the file on every path.
         with fobj:
             for raw_line in fobj:
                 # Decode only when the binary read yields something other
                 # than the native str type, so keys are always plain str.
                 if not isinstance(raw_line, str):
                     raw_line = raw_line.decode('utf-8')
                 # Remove whitespace characters before using it as a key
                 seen_dict[raw_line.rstrip()] = True
     except Exception as e:
         print("DUE Unit: Exception occurred while loading file - Error: %s" % e)
         # Notify Spider that something went wrong - None instead of dict
         return None
     return seen_dict
예제 #3
0
 def savetofile(self, filename=None, file_headers=True):
     """Store the whole hash-url dictionary (``self.seen``) on hard disk.

     This function is recommended to be used externally, from a process that
     monitors and handles the DUEUnit when the crawler runs short of main
     memory.  Currently the number of dictionary records is the recommended
     criterion.

     One key of ``self.seen`` is written per line, UTF-8 encoded, to
     ``self.filespath + filename``.  Only after the file has been written and
     closed is the file name appended to ``self.filelist`` and ``self.seen``
     cleared; on failure both are left untouched.

     Args:
         filename: target file name, relative to ``self.filespath``.  When
             empty it defaults to ``<netloc>.<len(self.filelist)>.seenurls``.
         file_headers: when true, a ``BASE URL: <netloc>/`` line is written
             first.

     Returns:
         True when everything was saved, None when the file could not be
         created or written.
     """
     if not filename:
         filename = str(self.base_url['netloc']) + "." + str(len(self.filelist)) + ".seenurls"

     def to_bytes(text):
         # Encode only what is not already bytes, so the file content is
         # UTF-8 whatever the native string type is.
         return text if isinstance(text, bytes) else text.encode('utf-8')

     # Read/write permission bits for user, group and others (subject to umask).
     rw_all = (stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
               | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
     try:
         # O_TRUNC: stale lines of an older, longer file with the same name
         # must not survive after the new content.
         fd = os.open(self.filespath + filename,
                      os.O_CREAT | os.O_WRONLY | os.O_TRUNC, rw_all)
     except Exception as e:
         print("DUE Unit: Error while Creating file - Error: %s" % e)
         return None
     try:
         # Write raw bytes; encoding is done explicitly by to_bytes().
         fobj = os.fdopen(fd, "wb")
     except Exception as e:
         # The wrapper never took ownership, so release the descriptor here;
         # it may already have been closed by the failed wrapper.
         try:
             os.close(fd)
         except OSError:
             pass
         print("DUE Unit: Error while Saving file - Error: %s" % e)
         return None
     try:
         # The with-block closes (and flushes) the file on every path, so a
         # failing flush is reported before any in-memory state is dropped.
         with fobj:
             if file_headers:
                 fobj.write(to_bytes("BASE URL: " + str(self.base_url['netloc']) + "/\n"))
             # Snapshot the keys so the loop never iterates the live dict.
             for url in list(self.seen):
                 fobj.write(to_bytes(str(url) + "\n"))
     except Exception as e:
         print("DUE Unit: Error while Saving file - Error: %s" % e)
         # Return None for the Spider to know that some error occurred
         return None
     # Adding the new file name in the file list
     self.filelist.append(str(filename))
     # Clears the seen dictionary
     self.seen.clear()
     # Return True for the Spider to know that everything went OK
     return True