def rmBindMount(self, sid):
    '''
    Lazily unmounts and deletes every Fcombine user's bind mount point
    for the server identified by the given SID.
    '''
    log(3, "performing rmbindMount(sid) against all BMP's for sid '%s'" % sid)
    # Resolve the server's name from its SID; an unknown SID means a stale
    # SLAMMount directory that we cannot map back to a bind mount.
    try:
        serverName = xsftp.webui.models.Server.objects.get(id=sid).server_name
    except xsftp.webui.models.Server.DoesNotExist:
        log(3, "Found an unrecognised SID (%s) in the SLAMMount directory, so can't unBindMount it - skipping." % sid)
        return
    liveBindMountPointsDict = self.getLiveBindMountPoints()
    for owner in liveBindMountPointsDict.keys():
        # Skip users who hold no BMP for this server.
        if serverName not in liveBindMountPointsDict[owner]:
            continue
        # Repeat the lazy umount until it fails, which unwinds any
        # stacked bind mounts on the same path.
        lazyUmountCmd = "umount -l /home/" + owner + "/xsftp/" + serverName + " > /dev/null 2>&1"
        result = 0
        while not result:
            result = os.system(lazyUmountCmd)
        # Remove the placeholder file; it may legitimately be absent.
        try:
            os.remove("/home/%s/xsftp/%s/where_are_my_files.txt" % (owner, serverName))
        except OSError:
            pass
        # Finally remove the (now empty) mount point directory.
        os.system("rmdir /home/" + owner + "/xsftp/" + serverName + " > /dev/null 2>&1")
    return
def initSLAMMountPoints(self):
    '''
    Creates and removes (cleans up) the SLAM mount point directories in
    xsftp.common.constants.SERVERDIR based on data from the DB.
    '''
    expectedSLAMMountPoints = self.getExpectedSLAMMountPoints()
    log(4, "Expected SLAM Mount Points are ... %s" % expectedSLAMMountPoints)
    liveSLAMMountPoints = self.getLiveSLAMMountPoints()
    log(4, "Live SMPs are ... %s" % [int(i) for i in liveSLAMMountPoints])
    expectedSids = [str(x) for x in expectedSLAMMountPoints]
    # Create a directory for every expected SID that is not yet live.
    for sid in expectedSids:
        if sid in liveSLAMMountPoints:
            continue
        newDir = xsftp.common.constants.SERVERDIR + str(sid)
        os.system("mkdir " + newDir + " > /dev/null 2>&1")
        # Drop in the placeholder explaining where the user's files went.
        shutil.copy("%setc/xsftp/where_are_my_files.txt" % xsftp.common.constants.APPDIR, newDir)
    # Tear down every live SID that is no longer expected.
    for sid in liveSLAMMountPoints:
        if sid in expectedSids:
            continue
        # Lazily unmount and delete the server's associated bind mount points first.
        self.bmp_manager.rmBindMount(sid)
        # Lazily unmount the SMP, repeating until umount reports failure so
        # any stacked mounts are fully unwound.
        unmountSLAMMountPointCmd = "umount -l " + xsftp.common.constants.SERVERDIR + sid + " > /dev/null 2>&1"
        rc = 0
        while not rc:
            rc = os.system(unmountSLAMMountPointCmd)
        # Remove the placeholder file and the SLAM mount point directory.
        os.system("rm -f " + xsftp.common.constants.SERVERDIR + sid + "/where_are_my_files.txt; rmdir " + xsftp.common.constants.SERVERDIR + sid + " > /dev/null 2>&1")
    return
def doBindMount(self, bmpAbsPath):
    '''
    Atempts to bind-mount a server referenced by the specified bmpAbsPath.
    If server is already bind mounted, we just return successfully.
    This function references the specified server's record in the Django for mount parameters.
    '''
    pathParts = bmpAbsPath.split("/")
    # BMP paths look like /home/<user>/xsftp/<server_name>.
    name = pathParts[-1]
    user = pathParts[2]
    # Nothing to do when the mount is already live for this user.
    liveBindMounts = self.getLiveBindMounts()
    if liveBindMounts.has_key(user) and name in liveBindMounts[user]:
        log (4, "BMP %s is already bind mounted, skipping doBindMount." % bmpAbsPath)
        return
    # Map the server name back to its SID to locate the SLAM mount source.
    sid = xsftp.webui.models.Server.objects.get(server_name=name).id
    smbAbsPath = "%s%s" % (xsftp.common.constants.SERVERDIR, sid)
    log(4, "Bind mounting %s to %s" % (smbAbsPath, bmpAbsPath))
    os.system("mount --bind %s %s > /dev/null 2>&1" % (smbAbsPath, bmpAbsPath))
    return
def sendEmailAlert(self, sid, state, time_first_seen_in_new_state): ''' Sends an email to everyone in the global serverlink_alert_groups about the specified server-link's health problem. sid = sid of server which is unhealthy (int) state = int time = time first seen in this state (secs since epoc) ''' recipients = [] recipient_groups = xsftp.webui.models.Configuration.objects.all()[0].serverlink_alert_groups.all() for group in recipient_groups: for user in group.users.all(): if user not in recipients: recipients.append(user) email_addresses = [user.email for user in recipients if user.email] server_link = xsftp.webui.models.Server.objects.get(id=sid) server_link_name = server_link.server_name device_name = xsftp.webui.models.Configuration.objects.all()[0].device_name if not email_addresses: log(1, "Could not send Server Link Health warning email for Server '%s': No 'Server Link Health Global Alert Groups' have been specified." % server_link_name) # instantiate a new Server object, set its state, then extract its html details for that state. if server_link.status != state: server_link.status = state # generate text details, by converting the html healthstrings to text for email rendering. 
myWriter = formatter.DumbWriter() myFormatter = formatter.AbstractFormatter(myWriter) p = EmailTextParser(myFormatter) p.feed(server_link.healthStrings()) # remove tab characters details = p.data.replace('\t','') # remove blank lines details = "\n".join([line for line in details.split("\n") if line != '']) p.close() #details = server_link.healthStrings() # generate time string total_seconds = int(time.time() - time_first_seen_in_new_state) days = total_seconds / 86400 hours = total_seconds % 86400 / 3600 minutes = total_seconds % 86400 % 3600 / 60 seconds = total_seconds % 86400 % 3600 % 60 time_string = "%s days, %s hours, %s minutes, %s seconds" % (days, hours, minutes, seconds) message = ''' This is an automatic message from the Fcombine Device: %(device_name)s The Server Link '%(server_link_name)s' has been in unhealthy state %(state)s for %(time_string)s. arning - Jobs and Users may not be able to utilise this Server Link until it is repaired. See details below for help on remediating this issue. Details are: %(details)s ''' % {"device_name":device_name, "server_link_name":server_link_name, "state":state, "time_string":time_string, "details":details} try: email.send_email(subject="Fcombine Server Link Health warning for Server '%s'" % server_link_name, body=message, to=email_addresses) except Email_Error, e: log(1, "Could not send Server Link Health warning email for Server '%s': %s" % (server_link_name, e))
def setStatus(self, bmpAbsPath, state): """ Sets status values of a specified BMP in the serverStatusDict dictionary, which is of the form: {BMP : ('sid', state, timeSinceEpocFirstSeenInCurrentState, timeSinceEpocLastSeenHealthy)} Takes in two arguments: bmpAbsPath (string) is the BMP's absolute path as a string state (int) is the state value to assign to the specified BMP * If BMP has never been healthy, timeSinceEpocLastSeenHealthy value will be set to the time the daemon started. """ # if there is no entry in the serverStatusDict for this bmp, and it's state is initially not zero (ie. it is unhealthy) then set this value to -1 self.shared_vars.serverStatusDictLock.acquire() # if BMP has no entry in the serverStatusDict, set timeSinceLastHealthy value to the current time. if not self.shared_vars.serverStatusDict.has_key(bmpAbsPath): timeSinceLastHealthy = int(time.time()) else: # assign it's new timeSinceLastHealthy value to that which it was previously. timeSinceLastHealthy = self.shared_vars.serverStatusDict[bmpAbsPath][3] self.shared_vars.serverStatusDictLock.release() # if the status value for the BMP given in this functions second argumen't is 0 (healthy) if state == 0: # set the timeSinceLastHealthy to now: timeSinceLastHealthy = int(time.time()) # save the values to the tuples in the serverStatusDict dictionary. 
self.shared_vars.serverStatusDictLock.acquire() # if dicrt does not yet contain status for this bmp, or status has changed if ( not self.shared_vars.serverStatusDict.has_key(bmpAbsPath) or self.shared_vars.serverStatusDict[bmpAbsPath][1] != state ): # we have detected a status change, set timeFirstSeenInCurrentState timeFirstSeenInCurrentState = int(time.time()) log(3, "State Change: Found " + bmpAbsPath + " in state " + str(state)) log( 2, "State Change: Found Server Link '" + os.path.basename(bmpAbsPath) + "' in %s state " % ["unhealthy", "healthy"][state == 0] + str(state), ) else: # preserve timeFirstSeenInCurrentState timeFirstSeenInCurrentState = self.shared_vars.serverStatusDict[bmpAbsPath][2] # save the values sid = xsftp.webui.models.Server.objects.get(server_name=bmpAbsPath.split("/")[-1]).id self.shared_vars.serverStatusDict[bmpAbsPath] = (sid, state, timeFirstSeenInCurrentState, timeSinceLastHealthy) self.shared_vars.serverStatusDictLock.release() # if state is healthy, remove this bmp's entry in the global alertTracekr dict self.shared_vars.alertTrackerLock.acquire() if state == 0 and sid in self.shared_vars.alertTracker: self.shared_vars.alertTracker.pop(sid) self.shared_vars.alertTrackerLock.release()
def unBindMount(self, bmpAbsPath):
    '''
    Performs a lazy unmount on specified bmpAbsPath
    '''
    log(4,"Doing unBindMount('%s')..." % bmpAbsPath)
    # The server name is the final component of the BMP path.
    serverName = bmpAbsPath.split("/")[-1]
    # Nothing to do when this server is not currently bind mounted.
    if serverName not in self.getLiveBindMountList():
        log (5, "BMP %s is not bind mounted, skipping unBindMount." % bmpAbsPath)
        return
    os.system("umount -l %s > /dev/null 2>&1" % bmpAbsPath)
    return
def get_key_fingerprint(self, address, port, write_log=True):
    '''
    Returns the hex fingerprint of the 'ssh-rsa' key recorded for the given
    address/port in the known_hosts file, or None if it cannot be found.
    '''
    # known_hosts keys entries for non-standard ports as "[addr]:port".
    kh_key = address if str(port) == '22' else "[%s]:%s" % (address, port)
    fingerPrint = None
    try:
        host_keys = paramiko.util.load_host_keys(os.path.expanduser(xsftp.common.constants.KNOWN_HOSTS_FILE))
        fingerPrint = hexlify(host_keys[kh_key]['ssh-rsa'].get_fingerprint())
    except IOError:
        # known_hosts file is missing entirely.
        if write_log:
            log(4, "fingerprint check for endpoint server at address '%s' failed: Known hosts file does not exist!" % address)
    except KeyError:
        # No entry for this host (or no ssh-rsa key recorded for it).
        if write_log:
            log(4, "fingerprint check for endpoint server at address '%s' failed: Address not found in known hosts file." % address)
    return fingerPrint
def initAllBindMounts(self):
    '''
    Bind mounts every BMP which is not already bind mounted.
    All required BMP directories are assumed to already exist.
    '''
    liveBindMountPointDict = self.getLiveBindMountPoints()
    # Walk each user's set of bind mount points and (re)mount them.
    for owner, mountNames in liveBindMountPointDict.items():
        for mountName in mountNames:
            bmpAbsPath = "/home/%s/xsftp/%s" % (owner, mountName)
            log(4, "Bindmounting %s" % (bmpAbsPath))
            # doBindMount() is a no-op when the BMP is already mounted.
            self.doBindMount(bmpAbsPath)
    return
def run(self):
    # Connector worker main loop: watches the shared serverStatusDict and
    # spawns remediation threads / raises alerts for unhealthy BMPs.
    while True:
        # Take a consistent snapshot of the status dict under its lock.
        self.shared_vars.serverStatusDictLock.acquire()
        currentServerStatusDict = self.shared_vars.serverStatusDict.copy()
        self.shared_vars.serverStatusDictLock.release()
        # Entry tuples are (sid, state, timeFirstSeenInCurrentState, timeLastSeenHealthy).
        bmpList = currentServerStatusDict.keys()
        for bmp in bmpList:
            # Log this BMP's state if it is not healthy (state != 0).
            if currentServerStatusDict[bmp][1]:
                log(5, "connectorWorkerThread %s reports: - BMP %s has been in STATE %s for %s seconds." % (self.getName(), bmp, currentServerStatusDict[bmp][1], (int(time.time()) - currentServerStatusDict[bmp][2])))
                log(3, "Server Link '%s' has been in unhealthy state %s for %s seconds" %( os.path.basename(bmp), currentServerStatusDict[bmp][1], (int(time.time()) - currentServerStatusDict[bmp][2])))
            # If the server has been unhealthy for over config.REPAIR_DELAY seconds...
            if currentServerStatusDict[bmp][1] != 0 and (int(time.time()) - currentServerStatusDict[bmp][3]) > config.REPAIR_DELAY:
                # ...log how long it has been in this state and since last healthy.
                log(3, "BMP Requires repair: BMP %s has been in current unhealthy state %s for over %s seconds, and has been unhealthy for %s seconds." % (bmp, currentServerStatusDict[bmp][1], (int(time.time()) - currentServerStatusDict[bmp][2]), (int(time.time()) - currentServerStatusDict[bmp][3])) )
                # Skip if this BMP is currently having its status determined.
                self.shared_vars.statChecksInProgressLock.acquire()
                if bmp in self.shared_vars.statChecksInProgress:
                    self.shared_vars.statChecksInProgressLock.release()
                    log(3, "Not spawning a repair for BMP %s - Reason: BMP is currently having its status checked" % bmp)
                    continue
                self.shared_vars.statChecksInProgressLock.release()
                # Check whether this BMP is being, or has been, worked on.
                self.shared_vars.serverRepairInProgressLock.acquire()
                if self.shared_vars.serverRepairInProgress.count((bmp, False)):
                    # Repair currently underway - do not spawn another remediator thread.
                    log(3, "Not spawning a repair for BMP %s - Reason: Repair job already underway..." % bmp)
                elif self.shared_vars.serverRepairInProgress.count((bmp, True)):
                    # Repair completed; awaiting the statusWorkerThread's re-check.
                    log(3, "Not spawning a repair for BMP %s - Reason: Repaired but check still pending..." % bmp)
                else:
                    # Queue a repair job and spawn a remediator thread for it.
                    # Job queue entries are (bmp, done) tuples: done is False until the
                    # remediator finishes, after which a statWorkerThread clears True entries.
                    self.shared_vars.serverRepairInProgress.append((bmp, False))
                    remediatorThread = RemediatorWorkerThread(self.shared_vars, self.slam_manager, self.bmp_manager, bmp)
                    remediatorThread.start()
                    log(3, "BMP %s added to repair queue." % bmp)
                    log(3, "Attempting repair of Server Link: %s" % os.path.basename(bmp))
                self.shared_vars.serverRepairInProgressLock.release()
            # If the server has been unhealthy for over config.ALERT_DELAY, alert.
            if currentServerStatusDict[bmp][1] != 0 and (int(time.time()) - currentServerStatusDict[bmp][3]) > config.ALERT_DELAY:
                log(3, "BMP %s has been unhealthy for over %s mins. Activating alert subsystem" % ( bmp, ((int(time.time()) - currentServerStatusDict[bmp][3]))/60 ) )
                log(2, "Server link %s has been unhealthy for over %s mins. Activating alert subsystem" % ( os.path.basename(bmp), ((int(time.time()) - currentServerStatusDict[bmp][3]))/60 ) )
                # Fire up the alerting subsystem for this BMP.
                self.raiseEmailAlert(bmp)
        time.sleep(self.CWT_SLEEP)
def run(self): status = -999 # catch all, but it will get overwritten. # get the server object referred to in the specified bmpAbsPath server = xsftp.webui.models.Server.objects.get(server_name=self.bmpAbsPath.split("/")[-1]) # derive the smpAbsPath of the specified bmpAbsPath smpAbsPath = xsftp.common.constants.SERVERDIR + str(server.id) try: # stat the bmp to see how it is bmpStat = os.stat(self.bmpAbsPath) # returns a stat object # if we get here without invoking an exception ... # stat the associated smpAbsPath to see how it is smpStat = os.stat(smpAbsPath) # stat the root partition (this should never fail, but catch it in any case rootStat = os.stat(xsftp.common.constants.SERVERDIR) # If we get here, then all stat commands have completed SUCCESSFULLY # Now do some comparisons # if the bmp device is the same as the smp device, and different to the root device: if bmpStat.st_dev == smpStat.st_dev != rootStat.st_dev: # everything is OK status = self.MPSTATE_OK # elif the bmp device is the same as the smp device and the same as the root device: elif bmpStat.st_dev == smpStat.st_dev == rootStat.st_dev: # the SLAM mount does not exist. Try find out why. s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(self.STWT_SOCKET_TIMEOUT) if server.type == "sftp": log(4, "Testing SFTP-specific health for BMP: %s" % self.bmpAbsPath) s.connect( (server.address, server.port) ) # this can throw several errors that we handle below, which divulge status info t = paramiko.Transport(s) t.connect() # this will throw if something other than an SSH daemon is listening on the remote port (banner error) because it happens pre-auth. 
remoteFingerPrint = t.get_remote_server_key().get_fingerprint() try: localFingerPrint = paramiko.util.load_host_keys( os.path.expanduser(xsftp.common.constants.KNOWN_HOSTS_FILE) )[server.address][ "ssh-rsa" ].get_fingerprint() # raises IOError if file doesnt exist or KeyError if server not a known_host yet except: localFingerPrint = None # if keys dont match... if localFingerPrint and not localFingerPrint == remoteFingerPrint: # The remote fingerprint has changed! could be man-in-the-middle, etc. status = self.MPSTATE_KEY_MISMATCH else: # the fingerprints are fine, continue the checks key = paramiko.DSSKey.from_private_key_file( str(config.KEYFILE) ) # could raise "IOError: [Errno 2] No such file or directory:" in event of missing key t.auth_publickey(server.remote_user, key) # can throw several errors which divulge status info # establish a client session to the endpoint to stat the specified remote_dir and ensure it exists c = t.open_sftp_client() st = c.stat(server.remote_path) # this raises IOError: [Errno 2] No such file on error # if the specified remote path isnt a directory if not stat.S_ISDIR(st.st_mode): # then the specified remote path is invalid status = self.MPSTATE_BAD_REMOTE_PATH else: # if we get this far, then there is nothing wrong with the SSH layer and below (physical, etc), so to remediate we can initialise the SMP. status = self.MPSTATE_SM_BROKEN elif server.type == "cifs": log(4, "Testing CIFS-specific health for BMP: %s" % self.bmpAbsPath) s.connect((server.address, server.cifs_port)) # if we get here, the target is listening and allowing connections on the specified port. 
# test for CIFS related errors s = SMBClient.SMBClient( server.address, server.cifs_port, server.cifs_share, username=server.remote_user, password=server.cifs_password, ) if server.remote_path and not s.is_dir(str(server.remote_path)): status = self.MPSTATE_BAD_REMOTE_PATH else: # if we get here, the cifs stuff looks good, set status to the CIFS catch all status = self.MPSTATE_CIFS_ERROR s.close() elif server.type == "ftp": log(4, "Testing FTP-specific health for BMP: %s" % self.bmpAbsPath) f = FTPClient.FTP( server.address, port=server.ftp_port, passive=server.ftp_passive, user=str(server.remote_user), passwd=str(server.ftp_password), ssl=server.ftp_ssl, ssl_implicit=server.ftp_ssl_implicit, ) f.login() f.retrlines("LIST", callback=lambda msg: None) f.cwd(str(server.remote_path)) # elif the bmp device is different to the smp device which is in turn different to the root device elif bmpStat.st_dev != smpStat.st_dev != rootStat.st_dev: # The SSHFS mount is correct, and the bindmount isn't status = self.MPSTATE_BM_BROKEN # elif the bmp device is different to the smp device which is in turn the same as the root device elif bmpStat.st_dev != smpStat.st_dev == rootStat.st_dev: # both the BM and SM are broken status = self.MPSTATE_BM_AND_SM_BROKEN else: # The catch-the-rest log(1, "Unexpected Server Link error, resetting Server Link %s:'%s'." % (server.server_name, server.id)) status = self.MPSTATE_ERROR1 # Catch exceptions for the above checks except FTPClient.error_wrong_service, e: status = self.MPSTATE_WRONG_SERVICE
status = self.MPSTATE_FTP_DATA_CHANNEL_ERROR except FTPClient.error_bad_credentials, e: status = self.MPSTATE_AUTH_FAILED except FTPClient.error_ftps_not_supported, e: status = self.MPSTATE_FTP_FTPS_NOT_SUPPORTED except FTPClient.error_ftpes_not_supported, e: status = self.MPSTATE_FTP_FTPES_NOT_SUPPORTED except FTPClient.error_bad_remote_path, e: status = self.MPSTATE_BAD_REMOTE_PATH except FTPClient.error_ftpes_required, e: status = self.MPSTATE_FTP_FTPES_REQUIRED except FTPClient.Error, FTPClientExceptionText: status = self.MPSTATE_FTP_ERROR log( 2, "Server Link '%s' (type FTP) in unhealthy state MPSTATE_FTP_ERROR, error message is: %s" % (server.server_name, FTPClientExceptionText), ) except SMBClient.SMBClientException, e: e = str(e) if e == "bad share name": status = self.MPSTATE_CIFS_BAD_SHARE_NAME elif e == "bad credentials": status = self.MPSTATE_AUTH_FAILED elif e == "wrong service": status = self.MPSTATE_WRONG_SERVICE else: log(1, "CIFS health error: %s" % e) status = self.MPSTATE_CIFS_ERROR except socket.gaierror, e: if e[0] == -2:
def run(self):
    # Remediator worker: attempts to repair one BMP based on its recorded state.
    # Snapshot the BMP's current state under the status-dict lock.
    self.shared_vars.serverStatusDictLock.acquire()
    currentServerStatus = self.shared_vars.serverStatusDict[self.bmpAbsPath]
    self.shared_vars.serverStatusDictLock.release()
    state = currentServerStatus[1]
    # Shorthand reference to the StatWorkerThread class for its MPSTATE_* constants.
    swt = StatWorkerThread
    # Attempt to remediate this server based on its state.
    if state == swt.MPSTATE_OK:
        # Healthy on arrival: can happen if the BMP recovered in the split second
        # between scheduling and this thread starting up. Nothing to repair.
        # NOTE(review): this early return skips the queue cleanup at the bottom,
        # so the (bmp, False) entry remains in serverRepairInProgress and the
        # pendingRepair entry is not popped - confirm this is intended.
        log(3, "Finished repair attempt: BMP=%s State=0:MPSTATE_OK (nothing to do, as it was healthy on arrival)" % self.bmpAbsPath)
        return
    elif state in [ swt.MPSTATE_BM_BROKEN, swt.MPSTATE_SM_BROKEN, swt.MPSTATE_BM_AND_SM_BROKEN, swt.MPSTATE_BM_UNREATTACHED, swt.MPSTATE_NO_ROUTE_TO_HOST, swt.MPSTATE_CONNECTION_REFUSED, swt.MPSTATE_CONNECTION_TIMEOUT, swt.MPSTATE_KEY_MISMATCH, swt.MPSTATE_KEYFILE_MISSING, swt.MPSTATE_WRONG_SERVICE, swt.MPSTATE_PUBLIC_KEY_NOT_ALLOWED, swt.MPSTATE_AUTH_FAILED, swt.MPSTATE_KEY_REQUIRES_PASSPHRASE, swt.MPSTATE_BAD_REMOTE_PATH, swt.MPSTATE_SOCKET_ERROR, swt.MPSTATE_CIFS_BAD_SHARE_NAME, swt.MPSTATE_CIFS_ERROR, swt.MPSTATE_FTP_DATA_CHANNEL_ERROR, swt.MPSTATE_FTP_FTPS_NOT_SUPPORTED, swt.MPSTATE_FTP_FTPES_NOT_SUPPORTED, swt.MPSTATE_FTP_FTPES_REQUIRED, ]:
        # FIX:(1) bring up the SMP (if necessary), rip down the bind mount
        # (if it exists), and re-establish the bind mount.
        self.slam_manager.doSLAMMount(self.sid)
        self.bmp_manager.unBindMount(self.bmpAbsPath)
        self.bmp_manager.doBindMount(self.bmpAbsPath)
        log(3, "Finished repair attempt: BMP=%s State=%s" % (self.bmpAbsPath, state))
    elif state in [ swt.MPSTATE_SM_DISCONNECTED_AND_BM_BROKEN, swt.MPSTATE_SM_DISCONNECTED,]:
        # FIX:(2) wait for the connection to be re-established naturally -
        # sshfs / mount.cifs / curlftpfs reconnect logic will fix it.
        log(3, "BMP %s has been in state %s (DISCONNECTED) for %s seconds - awaiting self-heal" % (self.bmpAbsPath, state, (int(time.time()) - currentServerStatus[2]) ) )
    elif state == swt.MPSTATE_BMP_DOESNT_EXIST:
        # FIX:(3) re-initialise all BMPs, then re-establish this bind mount.
        self.bmp_manager.initBindMountPoints()
        self.bmp_manager.doBindMount(self.bmpAbsPath)
        log(3, "Finished repair attempt: BMP=%s State=%s:MPSTATE_BMP_DOESNT_EXIST" % (self.bmpAbsPath, state) )
    elif state == swt.MPSTATE_SMP_DOESNT_EXIST:
        # FIX:(5) initialise all SMPs, then perform the FIX(1) sequence.
        self.slam_manager.initSLAMMountPoints()
        self.slam_manager.doSLAMMount(self.sid)
        self.bmp_manager.unBindMount(self.bmpAbsPath)
        self.bmp_manager.doBindMount(self.bmpAbsPath)
        log(3, "Finished repair attempt: BMP=%s State=%s:MPSTATE_SMP_DOESNT_EXIST" % (self.bmpAbsPath, state) )
    else:
        # state is one of MPSTATE_ERROR1..4, MPSTATE_FTP_ERROR, -10 (server link unused), etc.
        # FIX:(4) catch-all: pull down all BMs for this SM, remount the SM,
        # then bring up all BMs for it.
        self.slam_manager.unSLAMMount(self.sid)
        self.slam_manager.doSLAMMount(self.sid)
        # NOTE(review): initBindMountsLock is read off this thread object -
        # presumably shared across remediator threads via the constructor; confirm.
        self.initBindMountsLock.acquire()
        self.bmp_manager.initAllBindMounts()
        self.initBindMountsLock.release()
        log(3, "Finished repair attempt: BMP=%s State=%s:MPSTATE_ERROR%s" % (self.bmpAbsPath, state, state) )
    # Mark this BMP's repairInProgress entry as completed (False -> True) so a
    # statWorkerThread can re-check it and clear the entry.
    self.shared_vars.serverRepairInProgressLock.acquire()
    self.shared_vars.serverRepairInProgress.remove((self.bmpAbsPath, False))
    self.shared_vars.serverRepairInProgress.append((self.bmpAbsPath, True))
    self.shared_vars.serverRepairInProgressLock.release()
    # Remove this BMP's entry from the pendingRepair queue.
    self.shared_vars.pendingRepairLock.acquire()
    self.shared_vars.pendingRepair.pop(self.bmpAbsPath)
    self.shared_vars.pendingRepairLock.release()
    log(6, "Repair worker removed bmp %s from self.shared_vars.pendingRepair queue" % self.bmpAbsPath)
    log(3, "Completed repair attempt for Server Link: %s" % os.path.basename(self.bmpAbsPath))
    return
def run(self):
    '''
    Each iteration of this perpetual loop will profile every BMP and update the self.shared_vars.serverStatusDict with its status.
    It will also clear any jobs marked as completed from the remediation thread's job queue.
    '''
    while True:
        #log(6, "Number of objects = %s" % len(gc.get_objects()))
        self.initAllMountPoints()
        # Clean serverStatusDict of entries that should no longer be there
        # (failsafe against unbounded growth of the shared dict).
        exptectdBMPList = self.bmp_manager.getExpectedBindMountPoints(bmpabspath=True)
        self.shared_vars.serverStatusDictLock.acquire()
        currentServerStatusDict = self.shared_vars.serverStatusDict.copy()
        for key in currentServerStatusDict.keys():
            if key not in exptectdBMPList:
                log(6, "removing bmpAbsPath: '%s' from the self.shared_vars.serverStatusDict as it no longer requires health checking." % key)
                self.shared_vars.serverStatusDict.pop(key)
        self.shared_vars.serverStatusDictLock.release()
        # Profile each expected BMP for its status.
        bindMountPointsDict = self.bmp_manager.getExpectedBindMountPoints()
        for user in bindMountPointsDict.keys():
            for bindMountPoint in bindMountPointsDict[user]:
                bmpAbsPath = "/home/%s/xsftp/%s" % (user, bindMountPoint)
                # Skip BMPs still being profiled by a statWorkerThread spawned
                # in an earlier iteration of this loop.
                self.shared_vars.statChecksInProgressLock.acquire()
                if self.shared_vars.statChecksInProgress.__contains__(bmpAbsPath):
                    self.shared_vars.statChecksInProgressLock.release()
                    log(5, "%s's status is still being profiled, skipping this status update iteration." % bmpAbsPath)
                    continue
                else:
                    # Skip BMPs that are pending repair until the repair is done.
                    self.shared_vars.pendingRepairLock.acquire()
                    if bmpAbsPath in self.shared_vars.pendingRepair.keys():
                        log(5, "%s's is marked as repair pending, skipping this status update iteration." % bmpAbsPath)
                        self.shared_vars.pendingRepairLock.release()
                        self.shared_vars.statChecksInProgressLock.release()
                        continue
                    # Otherwise, spawn a statWorkerThread to determine its status.
                    self.shared_vars.pendingRepairLock.release()
                    self.shared_vars.statChecksInProgress.append(bmpAbsPath)
                    statThread = StatWorkerThread(self.shared_vars, bmpAbsPath)
                    statThread.start()
                    self.shared_vars.statChecksInProgressLock.release()
        # Write a table of per-BMP statuses to the logs.
        self.shared_vars.serverStatusDictLock.acquire()
        for key in self.shared_vars.serverStatusDict.keys():
            log(6, "*** self.shared_vars.serverStatusDict entry: %s : %s" % (key, str(self.shared_vars.serverStatusDict[key])))
        # Consolidate per-BMP statuses into per-server statuses:
        # {sid: (state, timeFirstSeenInCurrentState, timeLastSeenHealthy)}
        consolidatedServerStatusDict = dict()
        for bmp in self.shared_vars.serverStatusDict.keys():
            (sid, currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy) = self.shared_vars.serverStatusDict[bmp]
            if sid not in consolidatedServerStatusDict.keys():
                # First BMP seen for this server: take its details as-is.
                consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
            elif currentState == 0 and consolidatedServerStatusDict[sid][0] == 0:
                # Both healthy: keep the one with the oldest (lowest) transition time.
                if timeFirstSeenInCurrentState < consolidatedServerStatusDict[sid][1]:
                    consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
            elif currentState != 0 and consolidatedServerStatusDict[sid][0] == 0:
                # Unhealthy beats healthy: take the unhealthy BMP's details.
                consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
            elif currentState != 0:
                # Both unhealthy: keep the newest entry.
                # NOTE(review): this compares timeLastSeenHealthy against the stored
                # timeFirstSeenInCurrentState (index [1]) - looks inconsistent with
                # the comment's stated intent; confirm which field was meant.
                if timeLastSeenHealthy > consolidatedServerStatusDict[sid][1]:
                    consolidatedServerStatusDict[sid] = (currentState, timeFirstSeenInCurrentState, timeLastSeenHealthy)
        self.shared_vars.serverStatusDictLock.release()
        # Push the consolidated per-server statuses into the Django database.
        for sid in consolidatedServerStatusDict.keys():
            # print sid, consolidatedServerStatusDict[sid]
            server = xsftp.webui.models.Server.objects.get(id=sid)
            server.status = consolidatedServerStatusDict[sid][0]
            server.timeFirstSeenInCurrentState = datetime.datetime.fromtimestamp(consolidatedServerStatusDict[sid][1])
            server.timeLastSeenHealthy = datetime.datetime.fromtimestamp(consolidatedServerStatusDict[sid][2])
            server.time_last_checked = datetime.datetime.now()
            server.save(synchronise=False)
        # Nullify key_fingerprint in the DB for servers whose address no longer
        # has an entry in KNOWN_HOSTS.
        # NOTE(review): this open()/close() only probes that the file exists; an
        # IOError here would propagate out of run() and kill this thread - confirm
        # that a missing known_hosts file cannot occur here.
        f = file(xsftp.common.constants.KNOWN_HOSTS_FILE, 'r')
        f.close()
        knownHostAddresses = [host.split(':')[0].replace("[","").replace("]","") for host in paramiko.util.load_host_keys(os.path.expanduser(xsftp.common.constants.KNOWN_HOSTS_FILE)).keys()]
        for server in [serverObj for serverObj in xsftp.webui.models.Server.objects.all() if serverObj.key_fingerprint]:
            if server.address not in knownHostAddresses:
                server.key_fingerprint = None
                server.save(synchronise=False)
        # Add fingerprints to sftp-type server objects that lack one in Django
        # but do have one in known_hosts.
        for server in [serverObj for serverObj in xsftp.webui.models.Server.objects.all() if serverObj.type == "sftp" and not serverObj.key_fingerprint]:
            fingerPrint = self.get_key_fingerprint(server.address, server.port, write_log=False)
            if fingerPrint:
                server.key_fingerprint = fingerPrint
                server.save(synchronise=False)
        # Reconcile Job records that claim to be running (pid set) with actual
        # OS processes; clean up attributes of jobs whose process has vanished.
        jobs = xsftp.webui.models.Job.objects.all()
        running_jobs = list()
        for job in jobs:
            if job.pid:
                running_jobs.append(job)
            else:
                # Job isn't running: running_now must not be None (terminating)
                # or True (running now).
                if job.running_now != False:
                    job.running_now = False
                    job.save()
        for job in running_jobs:
            try:
                # getpgid raises OSError if no process with that pid exists.
                os.getpgid(job.pid)
            except OSError:
                # The process doesn't exist, so clean up the job's attributes.
                job.running_now = False
                job.pid = None
                job.save()
        # Sleep before the next profiling pass.
        time.sleep(self.SWT_SLEEP)
def doSLAMMount(self, sid): ''' Atempts to mount a server referenced by the specified sid. If server is already sshfs mounted, we just return successfully. ''' # check if sshfs mount is already being worked on self.SLAMMountsInProgressLock.acquire() log(6, "Acquired self.SLAMMountsInProgressLock") if sid not in self.SLAMMountsInProgress.keys(): log(5, "No other threads are working on this Server Link %s, I will assume responsiblity." % sid) self.SLAMMountsInProgress[sid] = threading.Condition(self.SLAMMountsInProgressLock) log(6, "About to release the self.SLAMMountsInProgressLock") self.SLAMMountsInProgressLock.release() else: log(5, "Server Link %s is already being worked on - waiting for it to be fixed" % sid) self.SLAMMountsInProgress[sid].wait() log(5, "This thread got woken up - Server Link %s has been marked as fixed (and waiting for check)" % sid) self.SLAMMountsInProgressLock.release() return # This next bit checks whether the sshfs mount is already mounted, which can happen if some other thread fixed just before we did. # Additionally, while we were getting here, a few threads on our tail may have already come in and joined the wait queue, # so we need to wake them up and then we can all bail out of here. if str(sid) in self.slam_manager.getLiveSLAMMounts(): log (5, "Server Link %s is already mounted, skipping." 
% sid) self.SLAMMountsInProgressLock.acquire() condition = self.SLAMMountsInProgress.pop(sid) condition.notifyAll() self.SLAMMountsInProgressLock.release() return ################################### ### SERVER LINK MOUNTING BEGINS ### # get server object referenced by sid in argument serverObj = xsftp.webui.models.Server.objects.get(id=sid) # ================ # SSHFS MOUNT CODE # ================ if serverObj.type == 'sftp': # get server_name server_name = serverObj.server_name # get GID for this server's linux write group gid = str(grp.getgrnam("x_%s" % sid)[2]) # get server's address address = serverObj.address # get port number port = int(serverObj.port) # get keyfile location key = serverObj.key_file # get remoteuser remoteuser = serverObj.remote_user # get remote path remotepath = serverObj.remote_path log(4, "Mounting %s (type: sftp): SID=%s ADDRESS=%s PORT=%s writeGroupName=x_%s GID=%s KEY=%s USERNAME=%s REMOTE_PATH=%s" % (server_name, sid, address, port, sid, gid, key, remoteuser, remotepath)) # if this server's address is NOT in the known_hosts file, or the file doesnt exist, add the StrictHostKeyChecking=no option to suppress interactive yes/no ssh confirmation doStrictKeyCheck = False for host in paramiko.util.load_host_keys(xsftp.common.constants.KNOWN_HOSTS_FILE).keys(): components = host.split(':') host_name = components[0].replace("[","").replace("]","") if len(components) == 1: port = 22 elif len(components) == 2: port = int(components[1]) if server.address == host_name and server.port == port: log(4, "Performing strict key check for server link %s since I found its matching hostname '%s:%s' in known_hosts" % (server_name, address, port)) doStrictKeyCheck = True if doStrictKeyCheck == True: mountCmd = "sshfs -o UserKnownHostsFile=%s,StrictHostKeyChecking=yes,compression=yes,cache=no,default_permissions,uid=0,gid=%s,umask=002,nonempty,reconnect,allow_other,IdentityFile=%s,ServerAliveInterval=3,port=%s %s@%s:'%s' %s%s > /dev/null 2>&1" % 
(xsftp.common.constants.KNOWN_HOSTS_FILE, gid, key, port, remoteuser, address, remotepath, xsftp.common.constants.SERVERDIR, sid) log(6, "sshfsmount command is: %s" % mountCmd) else: log (4, "omitting strict key check for server link %s since I could not find a matching hostname '%s' in known_hosts" % (server_name, address)) mountCmd = "sshfs -o UserKnownHostsFile=%s,StrictHostKeyChecking=no,compression=yes,cache=no,default_permissions,uid=0,gid=%s,umask=002,nonempty,reconnect,allow_other,IdentityFile=%s,ServerAliveInterval=3,port=%s %s@%s:'%s' %s%s > /dev/null 2>&1" % (xsftp.common.constants.KNOWN_HOSTS_FILE, gid, key, port, remoteuser, address, remotepath, xsftp.common.constants.SERVERDIR, sid) log(6, "sshfsmount command is: %s" % mountCmd) result = os.system(mountCmd) if result: # log failed sshfs mount attempt log(2, "Server Link (type: sftp) establishment attempt for server '%s' failed. Return code was %s" % (server_name, result)) else: # log successful sshfs mount attempt log(4, "Success: sshfs mount to %s:%s established." % (address, port)) log(1, "Server Link '%s' successfully established." 
% server_name) # get the key fingerprint from the known_hosts file fingerPrint = self.get_key_fingerprint(address, port) # save fingerprint to django models.Server if serverObj.key_fingerprint != fingerPrint: log(4, "Got new/different fingerprint %s for Server Link '%s'" % (fingerPrint, server_name)) serverObj.key_fingerprint = fingerPrint serverObj.save(synchronise=False) # =============== # CIFS MOUNT CODE # =============== elif serverObj.type == 'cifs': argDict = { 'sid':serverObj.id, 'name':serverObj.server_name, 'address': serverObj.address, 'cifs_port':serverObj.cifs_port, 'cifs_share': serverObj.cifs_share, 'remote_path': serverObj.remote_path, 'mount_point': "%s%s" % (xsftp.common.constants.SERVERDIR, sid), 'remote_user': serverObj.remote_user, 'cifs_password': serverObj.cifs_password, 'gid': str(grp.getgrnam("x_%s" % serverObj.id)[2]) } mountCmd = "/sbin/mount.cifs //%(address)s/'%(cifs_share)s'/'%(remote_path)s' %(mount_point)s -o user='******',pass='******',uid=0,gid=%(gid)s,rw,dir_mode=0775,file_mode=0775,port=%(cifs_port)s > /dev/null 2>&1" % argDict # If the specified remote path points to a file instead of a dir, the mount command will still work and the mount point will appear as that file. Ensure this does not happen. 
remote_path_ok = True SMBClientExceptionText = "unknown" try: s = SMBClient.SMBClient(serverObj.address, serverObj.cifs_port, serverObj.cifs_share, username=serverObj.remote_user, password=serverObj.cifs_password) if serverObj.remote_path: remote_path_ok = s.is_dir(str(serverObj.remote_path)) SMBClientExceptionText = "bad remote path" except Exception, SMBClientExceptionText: remote_path_ok = False try: s.close() except: pass if remote_path_ok: log(4, "Mounting %(name)s (type: cifs): SID=%(sid)s ADDRESS=%(address)s PORT=%(cifs_port)s GID=%(gid)s USERNAME=%(remote_user)s SHARE_NAME=%(cifs_share)s REMOTE_PATH=%(remote_path)s" % argDict) log(6, "cifs mount command is: %s" % mountCmd.replace("pass='******'" % serverObj.cifs_password, "pass=<HIDDEN>")) p = subprocess.Popen(mountCmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) rc = os.waitpid(p.pid, 0)[1] if rc: log(2, "Server Link (type: cifs) establishment attempt for server '%s' failed. Return code was %s" % (serverObj.server_name, rc)) else: log(2, "Server Link '%s':'%s' (type: cifs) successfully established." % (serverObj.address, serverObj.server_name)) else: log(2, 'Server Link (type: cifs) "%s" failed pre-checks (Error: %s), skipping establishment.' % (serverObj.server_name, SMBClientExceptionText))
id = serverObj.id # perform pre-checks do_ftp_mount = True log(6,"performing FTP pre-checks for server link %s" % server_name) try: f = FTPClient.FTP(address, port=ftp_port, passive=ftp_passive, user=remote_user, passwd=ftp_password, ssl=ftp_ssl, ssl_implicit=ftp_ssl_implicit) f.login() f.retrlines('LIST', callback=lambda msg: None) f.cwd(remote_path) except Exception, FTPClientExceptionText: do_ftp_mount = False try: f.quit() except: pass # perform actual ftp mount if do_ftp_mount: log(6,"performing FTP mount for server link %s" % server_name) # XXX note the use of the -f switch to curlftpfs below, which forces it not to daemonize and instead run in the foreground. If we don't do this, then for some reason some FTPES mounts (to Win2k8 IIS servers) won't work (the mount appears to work and the underlying FTP session is successfully established but trying to open the mountpoint for reading produces an IOError). Investigate this, nothing that we use our own slighly customized curlftpfs - check our RPM build dir for the source and patches. mountCmd = "curlftpfs -f -o transform_symlinks,connect_timeout=5,allow_other,default_permissions,uid=0,umask=002,nonempty,cache=no,ftp_timeout=10" if ftp_ssl: mountCmd += ",ssl,no_verify_peer,no_verify_hostname" if not ftp_passive: mountCmd += ",ftp_port=-,disable_epsv" ftp_credentials = ",user='******'" % (remote_user, ftp_password) #.replace(":",r"\:")) mountCmd += ftp_credentials mountCmd += ",gid=%s" % str(grp.getgrnam("x_%s" % id)[2]) if ftp_ssl and ftp_ssl_implicit: mountCmd += " ftps://" else: mountCmd += " ftp://" mountCmd += "%s:%s" % (address, ftp_port) if serverObj.remote_path: