示例#1
0
 def save_xhtml(self, xhtml_d):
     """Write one page source to ``<save_path><netloc>.<counter>.html``.

     ``self.file_counter`` is incremented on every call (successful or
     not) and its new value is used as ``<counter>`` in the file name.

     Args:
         xhtml_d: mapping providing ``'netloc'`` (used in the file name)
             and ``'xhtml_s'`` (the text written to the file).

     Returns:
         True when the text was written; None when the file could not be
         created/opened or the write failed (the error is printed).

     Side effects:
         Sets ``self.kill_evt`` when the file cannot be created.
     """
     assert os.path.isdir(self.save_path)
     self.file_counter += 1
     # Read/write permission for user, group and others (before umask).
     mode = (stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP
             | stat.S_IROTH | stat.S_IWOTH)
     fd = None
     try:
         try:
             path = (self.save_path + str(xhtml_d['netloc']) + "."
                     + str(self.file_counter) + ".html")
             # O_TRUNC: without it, overwriting an existing longer file
             # would leave the tail of the old content behind.
             fd = os.open(path, os.O_CREAT | os.O_WRONLY | os.O_TRUNC, mode)
         except Exception as e:
             # Stop the process first, then re-raise with some extra info.
             self.kill_evt.set()
             raise Exception("SCSpider error while Creating file - Error: %s" % e)
         # Line-buffered file-object wrapper around the descriptor.
         # NOTE(review): text mode uses the platform default encoding; the
         # original UTF-8 wrapper was commented out - confirm this is intended.
         fobj = os.fdopen(fd, "w", 1)
     except Exception as e:
         print("SCSpider error while Creating file - Error: %s" % e)
         if fd is not None:
             # fdopen() failed, so nothing owns the descriptor yet.
             # Best-effort close: it may already have been released.
             try:
                 os.close(fd)
             except OSError:
                 pass
         # None tells the spider that something went wrong.
         return None

     # The context manager closes the file on every path below.
     with fobj:
         try:
             fobj.write(xhtml_d['xhtml_s'])
         except Exception as e:
             print("SCSpider Error while Writing file - Error: %s" % e)
             return None
     return True
示例#2
0
 def __load_dict(self, filename=None):
     """Load a seen-URLs file into a dictionary.

     Every line of ``self.filespath + filename`` is decoded as UTF-8,
     stripped of trailing whitespace and stored as a key mapped to True.

     NOTE(review): every line becomes a key, including any header line
     the file may start with - confirm callers expect that.

     Args:
         filename: name of the file, relative to ``self.filespath``.

     Returns:
         dict mapping each line (text) to True, or None when the file
         cannot be opened or read (the error is printed).
     """
     try:
         fd = os.open(self.filespath + filename, os.O_RDONLY)
     except Exception as e:
         print("DUE Unit: Error while Opening file - Error: %s" % e)
         # Return None instead of a dictionary; there is nothing to close.
         return None

     seen_dict = dict()
     fobj = None
     try:
         # Binary file object + UTF-8 stream reader: decodes to text on
         # both Python 2 and 3 (EncodedFile over a text-mode file raises
         # TypeError on Python 3).
         fobj = os.fdopen(fd, "rb")
         for fileline in codecs.getreader('utf-8')(fobj):
             # Remove trailing whitespace before using the line as a key.
             seen_dict[fileline.rstrip()] = True
     except Exception as e:
         print("DUE Unit: Exception occurred while loading file - Error: %s" % e)
         # Notify the spider that something went wrong.
         seen_dict = None
     finally:
         if fobj is not None:
             fobj.close()
         else:
             # fdopen() failed, so the raw descriptor is still ours.
             # Best-effort close: it may already have been released.
             try:
                 os.close(fd)
             except OSError:
                 pass
     return seen_dict
示例#3
0
    def open_tail(self, path, go_to_end=False):
        """Open ``path`` for tailing and register a watch on it.

        Args:
            path: file to open (read-only, non-blocking) and to watch.
            go_to_end: when true, seek the descriptor to the end of the
                file before returning.

        Returns:
            A ``Tail.FileTail`` with ``file_descriptor``, ``path`` and
            ``watch_descriptor`` set.

        Raises:
            OSError: if the file cannot be opened or seeked.
            KeyError: if ``add_watch`` returns no entry for ``path``.
            Anything else ``self.watch_manager.add_watch`` raises.
        """
        tail = Tail.FileTail()
        tail.file_descriptor = os.open(path, os.O_RDONLY | os.O_NONBLOCK)
        tail.path = path

        try:
            if go_to_end:
                os.lseek(tail.file_descriptor, 0, os.SEEK_END)

            # add_watch returns a mapping keyed by path; take our entry.
            watch_descriptor = self.watch_manager.add_watch(
                path, INOTIFY_FILE_MASK, proc_fun=self._inotify_file)

            tail.watch_descriptor = watch_descriptor.pop(path)
        except BaseException:
            # The tail is never handed to the caller on this path, so the
            # descriptor must be released here or it leaks.
            os.close(tail.file_descriptor)
            raise
        return tail
示例#4
0
    def open_tail(self, path, go_to_end=False):
        """Open ``path`` for tailing and register a watch on it.

        Args:
            path: file to open (read-only, non-blocking) and to watch.
            go_to_end: when true, seek the descriptor to the end of the
                file before returning.

        Returns:
            A ``Tail.FileTail`` with ``file_descriptor``, ``path`` and
            ``watch_descriptor`` set.

        Raises:
            OSError: if the file cannot be opened or seeked.
            KeyError: if ``add_watch`` returns no entry for ``path``.
            Anything else ``self.watch_manager.add_watch`` raises.
        """
        tail = Tail.FileTail()
        tail.file_descriptor = os.open(path, os.O_RDONLY | os.O_NONBLOCK)
        tail.path = path

        try:
            if go_to_end:
                os.lseek(tail.file_descriptor, 0, os.SEEK_END)

            # add_watch returns a mapping keyed by path; take our entry.
            watch_descriptor = self.watch_manager.add_watch(
                path, INOTIFY_FILE_MASK,
                proc_fun=self._inotify_file)

            tail.watch_descriptor = watch_descriptor.pop(path)
        except BaseException:
            # The tail is never handed to the caller on this path, so the
            # descriptor must be released here or it leaks.
            os.close(tail.file_descriptor)
            raise
        return tail
示例#5
0
 def savetofile(self, filename=None, file_headers=True):
     """Store the whole seen-URL dictionary on disk and empty it.

     This function is recommended to be used externally, from a process
     that monitors and handles the DUEUnit when the crawler runs low on
     main memory. Currently the number of dictionary records is the
     recommended criterion.

     Args:
         filename: file name relative to ``self.filespath``. When empty,
             ``<netloc>.<len(self.filelist)>.seenurls`` is used.
         file_headers: when true, a ``BASE URL: <netloc>/`` line is
             written before the URLs.

     Returns:
         True when the file was written; None when it could not be
         created or opened (the error is printed).

     Raises:
         Any error raised while writing or closing the file. In that
         case ``self.seen`` and ``self.filelist`` are left untouched.
     """
     if not filename:
         filename = str(self.base_url['netloc']) + "." + str(len(self.filelist)) + ".seenurls"
     # Read/write permission for user, group and others (before umask).
     mode = (stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP
             | stat.S_IROTH | stat.S_IWOTH)
     try:
         # O_TRUNC: an existing longer file must not keep stale URLs
         # after the newly written ones.
         f = os.open(self.filespath + filename,
                     os.O_CREAT | os.O_WRONLY | os.O_TRUNC, mode)
     except Exception as e:
         print("DUE Unit: Error while Creating file - Error: %s" % e)
         return None
     try:
         fobj = os.fdopen(f, "wb")
     except Exception as e:
         print("DUE Unit: Error while Saving file - Error: %s" % e)
         # Nothing owns the descriptor yet. Best-effort close: it may
         # already have been released.
         try:
             os.close(f)
         except OSError:
             pass
         # None tells the spider that something went wrong.
         return None

     # UTF-8 stream writer over a binary file: accepts text on both
     # Python 2 and 3 (EncodedFile.write() rejects str on Python 3).
     fenc = codecs.getwriter('utf-8')(fobj)
     try:
         if file_headers:
             fenc.write("BASE URL: " + str(self.base_url['netloc']) + "/\n")
         # Snapshot the keys so the dictionary is not iterated directly.
         for url in list(self.seen.keys()):
             fenc.write(str(url) + "\n")
     finally:
         fenc.close()

     # Only reached once the data has been flushed and the file closed,
     # so the in-memory records are never dropped before they are on disk.
     self.filelist.append(str(filename))
     self.seen.clear()
     # True tells the spider that everything went OK.
     return True
示例#6
0
 else:
     # NOTE(review): the matching `if` is outside this excerpt. This branch
     # ends the current process immediately via os._exit(0).
     os._exit(0)
 # NOTE(review): Python 2 syntax below (print statement, `except OSError, e`,
 # xrange).
 print "(%s) now daemonized" % (os.getpid(), )
 # Close _all_ open (and otherwise!) files.
 import resource
 # Index [1] of getrlimit() is the hard limit on open file descriptors.
 maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
 if maxfd == resource.RLIM_INFINITY:
     # No finite limit to iterate up to, so fall back to a fixed bound.
     maxfd = 4096
 for fdnum in xrange(maxfd):
     try:
         os.close(fdnum)
     except OSError, e:
         # EBADF means the descriptor was not open; anything else is re-raised.
         if e.errno != errno.EBADF:
             raise
 # Remap std{in,out,err}
 devnull = os.open(os.path.devnull, os.O_RDWR)
 # Redirect targets are opened write-only, created if missing, in append mode.
 oflags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
 # Every descriptor was closed above, so devnull is normally already fd 0.
 if devnull != 0:  # stdin
     os.dup2(devnull, 0)
 # stdout: the file named by options.stdout if set, otherwise the null device.
 if options.stdout:
     stdout_fd = os.open(options.stdout, oflags)
     if stdout_fd != 1:
         # Duplicate onto fd 1, then drop the temporary descriptor.
         os.dup2(stdout_fd, 1)
         os.close(stdout_fd)
 else:
     os.dup2(devnull, 1)
 # stderr: same handling for fd 2 when options.stderr is set.
 # NOTE(review): the excerpt ends here; no fallback branch is visible.
 if options.stderr:
     stderr_fd = os.open(options.stderr, oflags)
     if stderr_fd != 2:
         os.dup2(stderr_fd, 2)
         os.close(stderr_fd)
示例#7
0
 else:
     # NOTE(review): the matching `if` is outside this excerpt. This branch
     # ends the current process immediately via os._exit(0).
     os._exit(0)
 # NOTE(review): Python 2 syntax below (print statement, `except OSError, e`,
 # xrange).
 print "(%s) now daemonized" % (os.getpid(),)
 # Close _all_ open (and otherwise!) files.
 import resource
 # Index [1] of getrlimit() is the hard limit on open file descriptors.
 maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
 if maxfd == resource.RLIM_INFINITY:
     # No finite limit to iterate up to, so fall back to a fixed bound.
     maxfd = 4096
 for fdnum in xrange(maxfd):
     try:
         os.close(fdnum)
     except OSError, e:
         # EBADF means the descriptor was not open; anything else is re-raised.
         if e.errno != errno.EBADF:
             raise
 # Remap std{in,out,err}
 devnull = os.open(os.path.devnull, os.O_RDWR)
 # Redirect targets are opened write-only, created if missing, in append mode.
 oflags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
 # Every descriptor was closed above, so devnull is normally already fd 0.
 if devnull != 0:  # stdin
     os.dup2(devnull, 0)
 # stdout: the file named by options.stdout if set, otherwise the null device.
 if options.stdout:
     stdout_fd = os.open(options.stdout, oflags)
     if stdout_fd != 1:
         # Duplicate onto fd 1, then drop the temporary descriptor.
         os.dup2(stdout_fd, 1)
         os.close(stdout_fd)
 else:
     os.dup2(devnull, 1)
 # stderr: same handling for fd 2 when options.stderr is set.
 # NOTE(review): the excerpt ends here; no fallback branch is visible.
 if options.stderr:
     stderr_fd = os.open(options.stderr, oflags)
     if stderr_fd != 2:
         os.dup2(stderr_fd, 2)
         os.close(stderr_fd)