def update_last_access_date():
    """Refresh "file.last_access_date" for all transfers."""
    get_files_pagination__reset()

    transfers_without_file = 0  # transfers in DB but not in FS

    # loop over blocks (trick not to load 300000 CTransfer objects in memory..).
    # Block size is given by pagination_block_size.
    transfers = get_files_pagination()
    while len(transfers) > 0:
        for t in transfers:
            # keep only "done" transfers
            if t.getStatus() != sdconst.TRANSFER_STATUS_DONE:
                continue

            # retrieve "last access date"
            l__date = None
            try:
                l__date = sdutils.get_last_access_date(t)
            except FileNotFoundException as e:
                sdlog.error("SDOPERAT-632", "File missing on filesystem (%s)" % t.get_full_local_path())
                transfers_without_file += 1
                continue
            except SDException as e:
                sdlog.error("SDOPERAT-532", "Fatal error")
                raise

            # set new date in DB
            updatetransferlastaccessdate(l__date, t.file_id)

        # fetch the next pagination block
        # (fix: without this the loop body never advances and spins forever)
        transfers = get_files_pagination()
def process_async_event():
    """Process queued events ('async' because events wait in the 'event' table before being processed)."""
    # process 200 events at a time (arbitrary)
    events = sdeventdao.get_events(status=sdconst.EVENT_STATUS_NEW, limit=200)

    if len(events) == 0:
        return

    try:
        sdppproxy.event(events)

        for evt in events:
            evt.status = sdconst.EVENT_STATUS_OLD

        sdeventdao.update_events(events, commit=False)
        sddb.conn.commit()

        sdlog.info("SYNDTASK-001", "Events status succesfully updated")
    except RemoteException as e:
        # non-fatal: undo status changes and retry on a later pass
        sddb.conn.rollback()
        sdlog.info("SYNDTASK-002", "Error occurs during event processing (%s)" % str(e))
    except Exception as e:
        # fatal
        sddb.conn.rollback()
        sdlog.error("SYNDTASK-018", "Fatal error occurs during event processing (%s)" % str(e))
        # debug
        #sdtrace.log_exception()
        raise
def event_loop():
    """Main scheduler loop: run tasks until asked to quit or a download thread fails."""
    global scheduler_state

    sdlog.info("SDTSCHED-533", "Connected to %s" % sdconfig.db_file, stderr=True)

    scheduler_state = 2
    start_watchdog()
    cleanup_running_transfer()
    scheduler_state = 1

    sdlogon.renew_certificate(True)

    sdlog.info("SDTSCHED-902", "Transfer daemon is now up and running", stderr=True)

    while True:
        assert os.path.isfile(sdconfig.daemon_pid_file)

        if quit == 0:
            run_soft_tasks()
            run_hard_tasks()

            if Download.exception_occurs:
                sdlog.error("SDTSCHED-002", "Fatal exception occured in download thread", stderr=True)
                break

        if quit == 1:
            # wait until all threads finish and until everything has been
            # processed on the database I/O queue
            if can_leave():
                sdlog.info("SDTSCHED-001", "Synchro data service stopped", stderr=True)
                break

        time.sleep(main_loop_sleep)

    print("")  # blank line before the final message
    sdlog.info("SDTSCHED-901", "Scheduler successfully stopped", stderr=True)
def update_last_access_date():
    """Refresh "file.last_access_date" for all transfers."""
    get_files_pagination__reset()

    transfers_without_file = 0  # transfers in DB but not in FS

    # loop over blocks (trick not to load 300000 CTransfer objects in memory..).
    # Block size is given by pagination_block_size.
    transfers = get_files_pagination()
    while len(transfers) > 0:
        for t in transfers:
            # keep only "done" transfers
            if t.get_status() != sdconst.TRANSFER_STATUS_DONE:
                continue

            # retrieve "last access date"
            l__date = None
            try:
                l__date = sdutils.get_last_access_date(t.get_full_local_path())
            except FileNotFoundException as e:
                sdlog.error("SDOPERAT-632", "File missing on filesystem (%s)" % t.get_full_local_path())
                transfers_without_file += 1
                continue
            except Exception as e:
                sdlog.error("SDOPERAT-532", "Fatal error (%s)" % str(e))
                raise

            # set new date in DB
            sdfiledao.update_transfer_last_access_date(l__date, t.file_id)

        # fetch the next pagination block
        # (fix: without this the loop body never advances and spins forever)
        transfers = get_files_pagination()
def filter(files):
    """Split 'files' into (keep, reject) lists.

    NOTE: the name shadows the 'filter' builtin; kept for interface compatibility.
    'File' records are kept only when their 'variable' attribute holds exactly
    one value; 'Dataset' records are always kept.
    """
    keep = []
    reject = []

    if files:
        # 'type' is the same for all files; 'type' itself IS scalar
        type_ = files[0]['type']

        if type_ == 'File':
            for f in files:
                variable = f.get('variable', [])
                assert isinstance(variable, list)

                if len(variable) == 1:
                    keep.append(f)
                else:
                    reject.append(f)
                    if sdconfig.log_domain_inconsistency:
                        sdlog.error("SDPOSXPC-002", "'variable' attribute contains too much values (id='%s',variable=%s)." % (f['id'], variable), stderr=sdconfig.print_domain_inconsistency, logger_name=sdconst.LOGGER_DOMAIN)
        elif type_ == 'Dataset':
            # currently, there is no reject rules for Dataset type, so we keep all of them
            keep.extend(files)

    return (keep, reject)
def process_async_event():
    """Push NEW events to postprocessing and mark them OLD ('async': events queue up in the 'event' table)."""
    # process 200 events at a time (arbitrary)
    events = sdeventdao.get_events(status=sdconst.EVENT_STATUS_NEW, limit=200)

    if not events:
        return

    try:
        sdppproxy.event(events)

        for event_ in events:
            event_.status = sdconst.EVENT_STATUS_OLD

        sdeventdao.update_events(events, commit=False)
        sddb.conn.commit()
        sdlog.info("SYNDTASK-001", "Events status succesfully updated")
    except RemoteException as e:
        # non-fatal: roll back and let a later pass retry
        sddb.conn.rollback()
        sdlog.info("SYNDTASK-002", "Error occurs during event processing (%s)" % str(e))
    except Exception as e:
        # fatal
        sddb.conn.rollback()
        sdlog.error("SYNDTASK-018", "Fatal error occurs during event processing (%s)" % str(e))
        # debug
        #traceback.print_exc(file=open(sdconfig.stacktrace_log_file,"a"))
        raise
def immediate_md_delete(tr):
    """Delete file (metadata only)."""
    sdlog.info("SDDELETE-080", "Delete metadata (%s)" % tr.get_full_local_path())
    try:
        sdfiledao.delete_file(tr, commit=False)
    except Exception as e:
        # best effort: failure is logged but not propagated
        sdlog.error("SDDELETE-128", "Error occurs during file metadata suppression (%s,%s)" % (tr.get_full_local_path(), str(e)))
def run(selections, args):
    """Upgrade each selection by re-running 'install' on it (non-interactively)."""
    # BEWARE: tricky statement
    #
    # 'upgrade' is a multi-selections 'subcommand' which do the same as the
    # mono-selection 'install' subcommand, but for many selections. What we do
    # here is replace 'upgrade' subcommand with 'install' subcommand, so that
    # we can, now that we are in 'upgrade' func/context, come back to the
    # existing mono-selection func, for each selection, with 'install' subcommand.
    args.subcommand = 'install'

    # force non-interactive mode
    args.yes = True

    excluded = get_exclude(args)

    for selection in selections:
        if selection.filename in excluded:
            continue

        try:
            sdlog.info("SDUPGRAD-003", "Process %s.." % selection.filename, stdout=True)
            install(args, selection)
        except sdexception.IncorrectParameterException as e:
            sdlog.error("SDUPGRAD-004", "Error occurs while processing %s (%s)" % (selection.filename, str(e)), stderr=True)
        except sdexception.SDException as e:
            sdlog.error("SDUPGRAD-008", "Error occurs while processing %s (%s)" % (selection.filename, str(e)), stderr=True)
def transfers_begin(transfers):
    """Renew the ESGF certificate (if needed) before transfers start."""
    try:
        sdlogon.renew_certificate(sdconfig.openid, sdconfig.password, force_renew_certificate=False)
    except Exception as e:
        sdlog.error("SDDMDEFA-502", "Exception occured while retrieving certificate (%s)" % str(e))
        raise
def start_transfer_script(cls, tr):
    """Renew the ESGF certificate (if needed) before starting the transfer script."""
    try:
        sdlogon.renew_certificate(False)
    except Exception as e:
        sdlog.error("SDDMDEFA-502", "Exception occured while retrieving certificate (%s)" % str(e))
        raise
def run(squeries, metadata, parallel):
    """Attach dataset attributes to 'metadata'.

    When the dataset url is missing, 'metadata' is returned unchanged.
    NOTE(review): this excerpt ends right after the except clause; the code
    consuming 'datasets_attrs' presumably follows in the full file — confirm.
    """
    datasets_attrs = None
    try:
        datasets_attrs = get_datasets_attrs(squeries, parallel)
    except MissingDatasetUrlException,e:
        sdlog.error("SDADDDSA-108", "Datasets cannot be set as dataset url is missing")
        return metadata
def add_dataset_timestamp(squeries, files, parallel):
    """Attach dataset timestamps to 'files'.

    When dataset_timestamp_url is missing, 'files' is returned unchanged.
    NOTE(review): this excerpt ends right after the except clause; the code
    consuming 'datasets_timestamps' presumably follows in the full file — confirm.
    """
    datasets_timestamps = None
    try:
        datasets_timestamps = get_datasets_timestamps(squeries, parallel)
    except MissingDatasetTimestampUrlException,e:
        sdlog.error("SYNDABTI-600", "Datasets timestamps cannot be set as dataset_timestamp_url is missing")
        return files
def renew_certificate(openid, password, force_renew_certificate=False, force_renew_ca_certificates=False):
    """Renew ESGF certificate using sdmyproxy module."""
    # extract info from openid
    try:
        (hostname, port, username) = sdopenid.extract_info_from_openid(openid)
    except Exception as e:
        sdlog.error("SYDLOGON-800", "Exception occured while processing openid (%s)" % str(e))
        raise
def run(squeries, metadata, parallel):
    """Attach dataset timestamps to 'metadata'.

    When dataset_timestamp_url is missing, 'metadata' is returned unchanged.
    NOTE(review): this excerpt ends right after the except clause; the code
    consuming 'datasets_timestamps' presumably follows in the full file — confirm.
    """
    datasets_timestamps = None
    try:
        datasets_timestamps = get_datasets_timestamps(squeries, parallel)
    except MissingDatasetTimestampUrlException,e:
        sdlog.error("SYNDABTI-600", "Datasets timestamps cannot be set as dataset_timestamp_url is missing")
        return metadata
def event(events):
    """Serialize 'events' and push them to the postprocessing service."""
    try:
        sdlog.info("SDPPPROX-001", "Push events to postprocessing")

        # transform list of event to list of dict (needed, because custom class
        # cannot be serialized to JSON)
        serialized_events = [e.__dict__ for e in events]

        get_service().event(serialized_events)  # send events

        sdlog.info("SDPPPROX-002", "%i events successfully transmitted to postprocessing" % len(serialized_events))
    except urllib2.URLError as e:
        # NOTE(review): 'url' and 'port' are not local — presumably module-level; confirm
        sdlog.error("SDPPPROX-010", "Network error occured (url=%s,port=%s,%s)" % (url, port, str(e)))
        raise RemoteException("SDPPPROX-100", "Network error occured")
def transfers_begin(transfers):
    """Submit 'transfers' to Globus, grouped by source endpoint.

    Side effects: activates destination and source endpoints, and records each
    submitted task in the module-level 'globus_tasks' dict (task_id -> group).
    Raises FatalException when the Globus API refuses a submission id or a transfer.
    """
    # Activate the destination endpoint
    _, _, access_token = api_client.goauth.get_access_token(username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(username=globus_username, goauth=access_token)
    activate_endpoint(api)

    # Divide all files that are to be transferred into groups based on the source globus endpoint
    globus_transfers = {}
    for tr in transfers:
        src_endpoint, src_path, path = map_to_globus(tr.url)
        local_path = tr.get_full_local_path()
        if not src_endpoint in globus_transfers:
            globus_transfers[src_endpoint] = {'src_endpoint': src_endpoint, 'items': []}
        globus_transfers[src_endpoint]['items'].append({
            'src_path': src_path,
            'dst_path': local_path,
            'tr': tr
        })
        sdlog.info("SDDMGLOB-001", "src_endpoint: %s, src_path: %s, local_path: %s" % (src_endpoint, src_path, local_path))

    # Submit transfers
    for src_endpoint in globus_transfers:

        # Activate the source endpoint
        activate_endpoint(api, src_endpoint)

        # Create a transfer and add files to the transfer
        code, message, data = api.transfer_submission_id()
        if code != 200:
            raise FatalException()
        submission_id = data['value']
        t = api_client.Transfer(submission_id, src_endpoint, dst_endpoint)
        sdlog.info("SDDMGLOB-004", "Globus transfer, source endpoint: %s, destination endpoint: %s" % (src_endpoint, dst_endpoint))
        for item in globus_transfers[src_endpoint]['items']:
            t.add_item(item['src_path'], item['dst_path'])
            sdlog.info("SDDMGLOB-005", "Globus transfer item, source path: %s, destination path: %s" % (item['src_path'], item['dst_path']))

        # Submit the transfer
        code, message, data = api.transfer(t)
        if code != 202:
            sdlog.error("SDDMGLOB-006", "Error: Cannot add a transfer: (%s, %s)" % (code, message))
            raise FatalException()
        task_id = data['task_id']
        sdlog.info("SDDMGLOB-007", "Submitted Globus task, id: %s" % task_id)
        globus_tasks[task_id] = globus_transfers[src_endpoint]
def immediate_md_delete(tr):
    """Delete file (metadata only)."""
    sdlog.info("SDDELETE-080", "Delete metadata (%s)" % tr.get_full_local_path())

    try:
        sdfiledao.delete_file(tr, commit=False)
    except Exception as e:
        # deletion failure is logged, not propagated (best effort)
        sdlog.error("SDDELETE-128", "Error occurs during file metadata suppression (%s,%s)" % (tr.get_full_local_path(), str(e)))
def run(self):
    """Process the item; on certificate failure, flag the service so the daemon stops."""
    try:
        self._service.run(self._instance)
        # add item in queue to handle database I/O in the main process
        self._queue.put(self._instance)
    except CertificateRenewalException as e:
        sdlog.error("SDWUTILS-003", "Certificate error: the daemon must be stopped")
        sdlog.error("SDWUTILS-001", "Thread didn't complete successfully")
        # no need to log stacktrace here as exception is already logged downstream
        self._service.exception_occurs = True
def run(self):
    """Process the item; any failure flags the service (bare except kept on purpose)."""
    try:
        self._service.run(self._instance)
        # add item in queue to handle database I/O in the main process
        self._queue.put(self._instance)
    except:  # bare except: any failure (not only Exception) must flag the service
        sdlog.error("SYDUTILS-024", "Thread didn't complete successfully")
        # debug
        traceback.print_exc(file=open(sdconfig.stacktrace_log_file, "a"))
        traceback.print_exc(file=sys.stderr)
        self._service.exception_occurs = True
def extract_info_from_openid(openid):
    """Retrieve username,host,port informations from ESGF openID."""
    try:
        xrds_buf = sdnetutils.HTTP_GET(openid, timeout=10)
        hostname, port = parse_XRDS(xrds_buf)
        username = parse_openid(openid)
        return (hostname, port, username)
    except Exception as e:
        sdlog.error("SDOPENID-200", "Error occured while processing OpenID (%s)" % str(e))
        raise OpenIDProcessingException('SDOPENID-002', 'Error occured while processing OpenID')
def run(self):
    """Run the download and hand the item back to the main process via the queue."""
    try:
        self._service.run(self._instance)  # calls Download.run()
        self._queue.put(self._instance)  # add item in queue to handle database I/O in the main process
    except sdexception.CertificateRenewalException as e:
        # error occured during certificate renewal
        sdlog.error("SDWUTILS-003", "Certificate error: the daemon must be stopped")
        sdlog.error("SDWUTILS-001", "Thread didn't complete successfully")
        # no need to log stacktrace here as exception is already logged downstream

        # we always stop daemon in this case, as download can't succeed without a
        # working certificate. TODO: but sometimes, it's just a temporary failure
        # (e.g. DNS failure during openid resolution), so maybe wait for 5 or 6
        # transfers to fail in a row before stopping the daemon.
        self._service.exception_occurs = True
def activate_endpoint(api, ep=None):
    """Activate Globus endpoint 'ep' (defaults to the destination endpoint) via delegate_proxy."""
    ep = dst_endpoint if ep is None else ep

    code, reason, reqs = api.endpoint_activation_requirements(ep, type='delegate_proxy')
    public_key = reqs.get_requirement_value("delegate_proxy", "public_key")
    proxy = x509_proxy.create_proxy_from_file(certificate_file, public_key, lifetime_hours=72)
    reqs.set_requirement_value("delegate_proxy", "proxy_chain", proxy)

    try:
        code, reason, result = api.endpoint_activate(ep, reqs)
    except api_client.APIError as e:
        sdlog.error("SDDMGLOB-028", "Error: Cannot activate the source endpoint: (%s)" % str(e))
        raise FatalException()
def event_loop():
    """Scheduler startup: init state, warm caches, retrieve ESGF certificate.

    NOTE(review): this excerpt ends inside the certificate-renewal error
    handling; the actual event loop presumably follows in the full file — confirm.
    """
    global scheduler_state

    sdlog.info("SDTSCHED-533", "Connected to %s" % sdconfig.db_file, stderr=True)

    scheduler_state = 2
    start_watchdog()
    cleanup_running_transfer()
    clear_failed_url()

    if sdconst.GET_FILES_CACHING:
        # initializes cache of max priorities
        sdfiledao.highest_waiting_priority(True, True)

    scheduler_state = 1

    if sdconfig.download:
        try:
            if sdconfig.is_openid_set():
                # In this mode, we keep retrying if ESGF IDP is not accessible (e.g. if ESGF is down)
                #
                # Note
                #     To be practical, a 'systemd reload sdt' command must be implemented
                #     (else, openid change in sdt.conf have no impact until the next
                #     retry, which may be a few hours..). Because currently, synda is not aware
                #     of sdt.conf changes while running.
                #
                #sdlogon.renew_certificate_with_retry(sdconfig.openid,sdconfig.password,force_renew_certificate=True)
                #sdlogon.renew_certificate_with_retry_highfreq(sdconfig.openid,sdconfig.password,force_renew_certificate=True)

                # In this mode, we stop the daemon if ESGF IDP is not accessible (e.g. if ESGF is down)
                sdlogon.renew_certificate(sdconfig.openid, sdconfig.password, force_renew_certificate=True)
            else:
                sdlog.error("SDTSCHED-928", 'OpenID not set in configuration file', stderr=True)
                raise OpenIDNotSetException("SDTSCHED-264", "OpenID not set in configuration file")
        except SDException, e:
            sdlog.error("SDTSCHED-920", "Error occured while retrieving ESGF certificate", stderr=True)
            raise
class WorkerThread(threading.Thread):
    """This class is the thread that handle the file transfer."""

    def __init__(self, instance, queue, service):
        threading.Thread.__init__(self)
        self._queue = queue  # the queue where to push the item once work is done to deferre database I/O
        self._instance = instance  # the item being processed
        self._service = service  # the service used to process the item

    def run(self):
        """Run the service on the item; push the item back on the queue for DB I/O in the main process."""
        try:
            self._service.run(self._instance)  # calls Download.run()
            self._queue.put(self._instance)  # add item in queue to handle database I/O in the main process
        except sdexception.CertificateRenewalException, e:
            # error occured during certificate renewal
            sdlog.error("SDWUTILS-003", "Certificate error: the daemon must be stopped")
            sdlog.error("SDWUTILS-001", "Thread didn't complete successfully")
            # no need to log stacktrace here as exception is already logged downstream

            # we always stop daemon in this case, as download can't succeed without a
            # working certificate. TODO: but sometimes, it's just a temporary failure
            # (e.g. DNS failure during openid resolution), so maybe wait for 5 or 6
            # transfers to fail in a row before stopping the daemon.
            self._service.exception_occurs = True
        except Exception, e:
            sdlog.error("SDWUTILS-002", "Thread didn't complete successfully")
            sdtrace.log_exception(stderr=True)
            # stopping on generic errors is configurable
            if sdconfig.stop_download_if_error_occurs:
                self._service.exception_occurs = True
def event(events):
    """Push 'events' to the postprocessing service as a list of dicts."""
    try:
        sdlog.info("SDPPPROX-001", "Push events to postprocessing")

        # transform list of event to list of dict (needed, because custom class
        # cannot be serialized to JSON)
        serialized_events = [e.__dict__ for e in events]

        # send events
        get_service().event(serialized_events)

        sdlog.info("SDPPPROX-002", "%i events successfully transmitted to postprocessing" % len(serialized_events))
    except urllib2.URLError as e:
        # NOTE(review): 'url' and 'port' are not local — presumably module-level; confirm
        sdlog.error("SDPPPROX-010", "Network error occured (url=%s,port=%s,%s)" % (url, port, str(e)))
        raise RemoteException("SDPPPROX-100", "Network error occured")
def run(self):
    """Run the download, then enqueue the item so the main process does the DB I/O."""
    try:
        self._service.run(self._instance)  # calls Download.run()
        # add item in queue to handle database I/O in the main process
        self._queue.put(self._instance)
    except sdexception.CertificateRenewalException as e:
        # error occured during certificate renewal
        sdlog.error("SDWUTILS-003", "Certificate error: the daemon must be stopped")
        sdlog.error("SDWUTILS-001", "Thread didn't complete successfully")
        # no need to log stacktrace here as exception is already logged downstream

        # we always stop daemon in this case, as download can't succeed without
        # a working certificate. TODO: but sometimes, it's just a temporary
        # failure (e.g. DNS failure during openid resolution), so maybe wait
        # for 5 or 6 transfers to fail in a row before stopping the daemon.
        self._service.exception_occurs = True
def immediate_delete(tr):
    """Delete file (metadata and data).

    Notes
        - This method remove files but not directories (directories are removed
          in "cleanup.sh" script)
    """
    sdlog.info("SDDELETE-055", "Delete transfer (%s)" % tr.get_full_local_path())

    if not os.path.isfile(tr.get_full_local_path()):
        return

    try:
        os.remove(tr.get_full_local_path())
        # note: if data cannot be removed (i.e. exception is raised), we don't remove metadata
        sdfiledao.delete_file(tr)
    except Exception as e:
        # best effort: failure is logged but not propagated
        sdlog.error("SDDELETE-528", "Error occurs during file suppression (%s,%s)" % (tr.get_full_local_path(), str(e)))
def run(i__queries):
    """
    Notes
        - this method contains the retry mecanism
        - return files list (not Response object)
    """
    # check: host must not be set at this step
    for q in i__queries:
        if sdconst.IDXHOSTMARK not in q['url']:
            raise SDException('SDPROXMT-044', 'Incorrect query: host must not be set at this step')

    max_retry = 6
    results = []

    l__queries = i__queries
    i = 0
    while i < max_retry:
        # MEMO: tmp_results is files list, not a Response object
        (tmp_results, failed_queries) = run_helper(l__queries)

        # add OK results
        results.extend(tmp_results)

        if len(failed_queries) > 0:
            sdlog.info("SDPROXMT-082", "%d search-API queries failed" % (len(failed_queries),))
            sdlog.info("SDPROXMT-083", "retry 'failed search-API queries'")
            l__queries = failed_queries
            i += 1
        else:
            if i > 0:
                sdlog.info("SDPROXMT-089", "retry succeeded")
            break

    if len(failed_queries) > 0:
        sdlog.error("SDPROXMT-084", "max retry iteration reached. %d queries did not succeed" % (len(failed_queries),))

    return results  # files list
def build_selection_list():
    """Load every selection file from disk.

    Return:
        selections list.

    Raises:
        SDException when a selection file cannot be parsed.
    """
    selections = []
    files = build_selection_file_list()  # contains selection files path list (fullpath)
    for file in files:
        try:
            buffer = sdbuffer.get_selection_file_buffer(path=file)
            selection = sdparse.build(buffer)
            selections.append(selection)
        except Exception as e:
            sdlog.error("SDSELGPU-001", "Exception occured (%s)" % str(e))
            raise SDException("SDSELGPU-001", "Error occured while loading '%s' selection file. See log for details." % file)
    return selections  # fix: the list was built but never returned
def run(self, url=None, attached_parameters=None):
    """Execute one search query (as pagination is used, it can result in many HTTP queries)."""
    if attached_parameters is None:
        attached_parameters = {}

    request = sdtypes.Request(url=url, pagination=True)
    final_url = request.get_url()

    sdlog.debug("SYDPROXY-490", "paginated call started (url=%s)" % final_url)

    try:
        paginated_response = self.call_web_service__PAGINATION(request)
    except Exception,e:
        sdlog.error("SYDPROXY-400", "Error occurs during search-API paginated call (url=%s)" % (final_url,))
        sdlog.error("SYDPROXY-410", "%s" % (str(e),))
        raise
    # NOTE(review): 'paginated_response' is not returned in this excerpt —
    # presumably the function continues beyond what is shown here; confirm.
def run(self, url=None, attached_parameters={}):
    """Execute one search query (as pagination is used, it can result in many HTTP queries)

    Returns:
        Response object

    NOTE(review): 'attached_parameters={}' is a mutable default argument — the
    sibling implementations use the None-sentinel idiom instead; confirm no
    caller relies on the shared default before changing it.
    """
    request = Request(url=url, pagination=True)
    final_url = request.get_url()

    sdlog.info("SYDPROXY-490", "paginated call started (url=%s)" % final_url)

    try:
        result = self.call_web_service__PAGINATION(request)  # return Response object
    except Exception,e:
        sdlog.error("SYDPROXY-400", "Error occurs during search-API paginated call (url=%s)" % (final_url,))
        sdlog.error("SYDPROXY-410", "%s" % (str(e),))
        raise
    # NOTE(review): 'result' is not returned in this excerpt — presumably the
    # function continues beyond what is shown here; confirm.
def start_transfer_script(cls, tr):
    """Download 'tr' with the sdget script and record the outcome on 'tr'.

    Mutates 'tr' in place: sdget_status, status, error_msg.
    Raises SDException when 'incorrect_checksum_action' holds an unknown value.
    """
    # renew certificate if needed
    try:
        sdlogon.renew_certificate(False)
    except:  # bare except kept: any renewal failure must stop the daemon
        sdlog.error("SDDOWNLO-504", "Certificate error: the daemon must be stopped")
        raise

    checksum_type = tr.checksum_type if tr.checksum_type is not None else 'md5'

    (tr.sdget_status, local_checksum, killed, script_stdxxx) = sdget.download(tr.url, tr.get_full_local_path(), checksum_type)

    if tr.sdget_status == 0:
        tr.status = sdconst.TRANSFER_STATUS_DONE

        assert tr.size is not None

        if int(tr.size) != os.path.getsize(tr.get_full_local_path()):
            sdlog.error("SDDOWNLO-002", "size don't match (remote_size=%i,local_size=%i,local_path=%s)" % (int(tr.size), os.path.getsize(tr.get_full_local_path()), tr.get_full_local_path()))

        # retrieve remote checksum
        remote_checksum = tr.checksum

        if remote_checksum is not None:  # fix: identity test instead of '!=None'
            # remote checksum exists: compare local and remote checksum
            if remote_checksum == local_checksum:
                # checksum is ok, nothing to do
                pass
            else:
                # checksum is not ok
                if incorrect_checksum_action == "remove":
                    tr.status = sdconst.TRANSFER_STATUS_ERROR
                    tr.error_msg = "File corruption detected: local checksum doesn't match remote checksum"

                    # remove file from local repository
                    sdlog.error("SDDOWNLO-155", "checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)" % (local_checksum, remote_checksum, tr.get_full_local_path()))
                    try:
                        os.remove(tr.get_full_local_path())
                    except Exception as e:
                        sdlog.error("SDDOWNLO-158", "error occurs while removing local file (%s)" % tr.get_full_local_path())
                elif incorrect_checksum_action == "keep":
                    sdlog.info("SDDOWNLO-157", "local checksum doesn't match remote checksum (%s)" % tr.get_full_local_path())
                    tr.status = sdconst.TRANSFER_STATUS_DONE
                    tr.error_msg = ""
                else:
                    raise SDException("SDDOWNLO-507", "incorrect value (%s)" % incorrect_checksum_action)
def stop():
    """Send SIGTERM to the daemon; clean up an orphan pidfile if the process is gone."""
    if not is_running():
        sdtools.print_stderr('Daemon is already stopped.')
        return

    pid = pidfile.read_pid()
    if psutil.pid_exists(pid):
        os.kill(pid, signal.SIGTERM)
    else:
        # sdlog import must not be at the top of this file, because of double-fork
        import sdlog
        sdlog.error('SDDAEMON-014', "Warning: daemon pidfile exists but daemon process doesn't exist. Most often, this is caused by an unexpected system restart (e.g. kernel panic).")

        # remove orphan pidfile
        sdlog.info('SDDAEMON-016', "Removing orphan daemon pidfile (%s)." % sdconfig.daemon_pid_file)
        os.unlink(sdconfig.daemon_pid_file)
def activate_endpoint(api, ep=None):
    """Activate Globus endpoint 'ep' (destination endpoint when None) using delegate_proxy."""
    if ep is None:
        ep = dst_endpoint

    code, reason, reqs = api.endpoint_activation_requirements(ep, type='delegate_proxy')
    public_key = reqs.get_requirement_value("delegate_proxy", "public_key")
    proxy = x509_proxy.create_proxy_from_file(sdconfig.esgf_x509_proxy, public_key, lifetime_hours=72)
    reqs.set_requirement_value("delegate_proxy", "proxy_chain", proxy)

    try:
        code, reason, result = api.endpoint_activate(ep, reqs)
    except api_client.APIError as e:
        sdlog.error("SDDMGLOB-028", "Error: Cannot activate the source endpoint: (%s)" % str(e))
        raise FatalException()
def is_file_complete(file):
    """Return True when the search-API file record carries the attributes needed downstream."""
    filename = file.get("title")

    if file.get("dataset_id") is None:
        sdlog.error("SDREDUCE-002", "incorrect dataset_id (filename=%s)" % filename)
        return False

    # memo: 'url_<proto>' is renamed to 'url' in a downstream step (in sdprotocol)
    if file.get("url_http") is None:
        sdlog.error("SDREDUCE-001", "Incorrect url_http (%s)" % filename)
        return False

    # disabled check (kept for reference):
    # if file.get("tracking_id") is None:
    #     sdlog.error("SDREDUCE-010","incorrect tracking_id (%s)"%filename)
    #     return False

    return True
def delete_transfers(limit=None, remove_all=True):
    """Perform the deletion of DATA and METADATA.

    Returns how many files with TRANSFER_STATUS_DELETE status remain

    Notes
        - Can be called from the daemon code (deferred mode), or from
          interactive code (immediate mode).
        - 'limit' is used to delete only a subset of all files marked for
          deletion each time this func is called. If 'limit' is None, all
          files marked for deletion are removed.
    """
    transfer_list = sdfiledao.get_files(status=sdconst.TRANSFER_STATUS_DELETE, limit=limit)

    try:
        for tr in transfer_list:
            if remove_all:
                immediate_delete(tr)
            else:
                immediate_md_delete(tr)

        # final commit (we do all deletion in one transaction)
        sddb.conn.commit()
    except Exception as e:
        sdlog.error("SDDELETE-880", "Error occurs during files suppression (%s)" % (str(e),))

        # no rollback here: i.e. we also commit if error occur (most likely a
        # filesystem permission error). This is to keep medatata synced with
        # data (else many files would have been removed from filesystem but
        # with metadata still in db..).
        #
        # TODO: exception is too generic here: improve this code by using a
        # specific exception for "permission error".
        sddb.conn.commit()

        raise  # fatal error

    return sdfilequery.transfer_status_count(status=sdconst.TRANSFER_STATUS_DELETE)
def extract_info_from_openid(openid):
    """Retrieve username,host,port informations from ESGF openID."""
    # openid check (see #44 for more info)
    for openid_host in invalid_openids:
        if openid_host in openid:
            sdlog.warning("SDOPENID-210", "Invalid openid (%s)" % openid)

    try:
        xrds_buf = sdnetutils.HTTP_GET_2(openid, timeout=10, verify=False)
        hostname, port = parse_XRDS(xrds_buf)
        username = parse_openid(openid)
        return (hostname, port, username)
    except Exception as e:
        sdlog.error("SDOPENID-200", "Error occured while processing OpenID (%s)" % str(e))
        raise OpenIDProcessingException('SDOPENID-002', 'Error occured while processing OpenID')
def run(host, port, username, force_renew_certificate=False, force_renew_ca_certificates=False, password=None):
    """Retrieve (and renew when needed) the ESGF myproxy certificate.

    Raises PasswordNotSetException / UsernameNotSetException on missing
    credentials, MissingCertificateException / InvalidCertificateException
    when retrieval fails.
    """
    # use passwd from passwd file if exists
    passwd = get_passwd_from_passwd_file()
    if passwd is not None:
        password = passwd

    # check password ("pwd" is the unset placeholder)
    if password == "pwd":
        sdlog.error("SDMYPROX-019", "ESGF password not set")
        raise sdexception.PasswordNotSetException()

    # check username
    if username is None:
        sdlog.error("SDMYPROX-020", "ESGF username not set")
        raise sdexception.UsernameNotSetException()

    if force_renew_certificate:
        if os.path.isfile(sdconfig.esgf_x509_proxy):
            os.unlink(sdconfig.esgf_x509_proxy)

    if force_renew_ca_certificates:
        if os.path.isdir(sdconfig.esgf_x509_cert_dir):
            shutil.rmtree(sdconfig.esgf_x509_cert_dir)

    # renew when the certificate is missing or no longer valid
    # (collapsed from the original nested if/else which called
    # renew_certificate in both the 'missing' and 'invalid' branches)
    if not (certificate_exists() and certificate_is_valid()):
        renew_certificate(host, port, username, password)

    # check (second pass => if it fails again, then fatal error)
    if not certificate_exists():
        sdlog.error("SDMYPROX-009", "Error occured while retrieving certificate")
        raise sdexception.MissingCertificateException()
    else:
        # fix: '0600' is a syntax error on Python 3; '0o600' is the same value on both
        os.chmod(sdconfig.esgf_x509_proxy, 0o600)  # needed by globus-url-copy
        if not certificate_is_valid():
            sdlog.error("SDMYPROX-010", "Error occurs while retrieving certificate")
            raise sdexception.InvalidCertificateException()
def event_loop():
    """Scheduler startup: init state and retrieve the ESGF certificate when downloads are enabled."""
    global scheduler_state

    sdlog.info("SDTSCHED-533", "Connected to %s" % sdconfig.db_file, stderr=True)

    scheduler_state = 2
    start_watchdog()
    cleanup_running_transfer()
    scheduler_state = 1

    if sdconfig.files_download:
        try:
            if not sdlogon.is_openid_set():
                sdlog.error("SDTSCHED-928", 'OpenID not set in configuration file', stderr=True)
                raise CertificateRenewalException("SDTSCHED-264", "OpenID not set in configuration file")

            # In this mode, we keep retrying if ESGF IDP is not accessible (e.g. if ESGF is down)
            #
            # Note
            #     To be practical, a 'systemd reload sdt' command must be implemented
            #     (else, openid change in sdt.conf have no impact until the next
            #     retry, which may be a few hours..). Because currently, synda is not
            #     aware of sdt.conf changes while running.
            #
            #sdlogon.renew_certificate_with_retry(True)
            #sdlogon.renew_certificate_with_retry_highfreq()

            # In this mode, we stop the daemon if ESGF IDP is not accessible (e.g. if ESGF is down)
            sdlogon.renew_certificate(True)
        except SDException as e:
            sdlog.error("SDTSCHED-920", "Error occured while retrieving ESGF certificate", stderr=True)
            raise
def immediate_delete(tr):
    """Delete file (metadata and data).

    Notes
        - This method remove files but not directories (directories are removed
          in "cleanup.sh" script)
    """
    sdlog.info("SDDELETE-055", "Delete transfer (%s)" % tr.get_full_local_path())

    if not os.path.isfile(tr.get_full_local_path()):
        return

    try:
        # note: if data cannot be removed (i.e. exception is raised), we don't remove metadata
        os.remove(tr.get_full_local_path())
        sdfiledao.delete_file(tr, commit=False)
    except Exception as e:
        sdlog.error("SDDELETE-528", "Error occurs during file suppression (%s,%s)" % (tr.get_full_local_path(), str(e)))
        raise
def run(self, url=None, attached_parameters=None):
    """Execute one search query (as pagination is used, it can result in many HTTP queries)."""
    attached_parameters = {} if attached_parameters is None else attached_parameters

    request = sdtypes.Request(url=url, pagination=True)
    final_url = request.get_url()

    sdlog.debug("SYDPROXY-490", "paginated call started (url=%s)" % final_url)

    try:
        paginated_response = self.call_web_service__PAGINATION(request)
    except Exception as e:
        sdlog.error("SYDPROXY-400", "Error occurs during search-API paginated call (url=%s)" % (final_url,))
        sdlog.error("SYDPROXY-410", "%s" % (str(e),))
        raise
def build_selection_list():
    """Load every selection file from disk.

    Return:
        selections list.

    Raises:
        SDException when a selection file cannot be parsed.
    """
    selections = []
    files = build_selection_file_list()  # contains selection files path list (fullpath)
    for file in files:
        try:
            buffer = sdbuffer.get_selection_file_buffer(path=file)
            selection = sdparse.build(buffer)
            selections.append(selection)
        except Exception as e:
            sdlog.error("SDSELGPU-001", "Exception occured (%s)" % str(e))
            raise SDException("SDSELGPU-001", "Error occured while loading '%s' selection file. See log for details." % file)
    return selections  # fix: the list was built but never returned
def check_if_frozen(p, pid, local_path, previous_processes, new_processes):
    """Terminate wget process 'p' when its output file stopped growing since the previous check."""
    key = "%s%s" % (pid, local_path)
    fsize = os.path.getsize(local_path)

    if key not in previous_processes:
        # new wget in town: just record its current size
        new_processes[key] = fsize
        return

    if fsize == previous_processes[key]:
        # size unchanged since last check => wget stalled
        sdlog.error("SDWATCHD-275", "wget is stalled (%s,%s)" % (local_path, pid))
        p.terminate()
    # else: wget works perfectly, no pb
def delete_transfers(limit=None, remove_all=True):
    """Perform the deletion of DATA and METADATA.

    Returns how many files with TRANSFER_STATUS_DELETE status remain

    Notes
        - Can be called from the daemon code (deferred mode), or from
          interactive code (immediate mode).
        - 'limit' is used to delete only a subset of all files marked for
          deletion each time this func is called. If 'limit' is None, all
          files marked for deletion are removed.
    """
    transfer_list = sdfiledao.get_files(status=sdconst.TRANSFER_STATUS_DELETE, limit=limit)

    try:
        for tr in transfer_list:
            immediate_delete(tr) if remove_all else immediate_md_delete(tr)

        sddb.conn.commit()  # final commit (we do all deletion in one transaction)
    except Exception as e:
        sdlog.error("SDDELETE-880", "Error occurs during files suppression (%s)" % (str(e),))

        # no rollback here: i.e. we also commit if error occur (most likely a
        # filesystem permission error). This is to keep medatata synced with
        # data (else many files would have been removed from filesystem but with
        # metadata still in db..).
        #
        # TODO: exception is too generic here: improve this code by using a
        # specific exception for "permission error".
        sddb.conn.commit()

        raise  # fatal error

    return sdfilequery.transfer_status_count(status=sdconst.TRANSFER_STATUS_DELETE)
def renew_certificate(force_renew_certificate,quiet=True,debug=False,force_renew_ca_certificates=False):
    """Renew ESGF certificate by running the external logon script.

    Raises:
        CertificateRenewalException: when the logon script exits non-zero.
    """
    # TODO: move this log into the script so to print only when expired
    #sdlog.info("SYDLOGON-002","Renew certificate..")

    # 'openid' is read from module scope — presumably set at import/config time (TODO confirm)
    (hostname,port,username)=sdopenid.extract_info_from_openid(openid)

    command=[sdconfig.logon_script,'-h',hostname,'-p',port,'-s',sdconfig.security_dir,'-u',username]
    if not quiet:
        command.append('-v')
    if force_renew_certificate:
        command.append('-r')
    if force_renew_ca_certificates:
        command.append('-x')

    (status,stdout,stderr)=sdutils.get_status_output(command)

    # prepared once: both branches print the same script name and status report
    script_name=os.path.basename(sdconfig.logon_script)
    report='status=%s\nstdout=%s\nstderr=%s\n'%(status,stdout.rstrip(os.linesep),stderr.rstrip(os.linesep))

    if status!=0:
        # print script stdxxx output (useful to debug certificate problem)
        if quiet:
            with open(sdconfig.stacktrace_log_file,'a') as fh:
                fh.write("'%s' script returned an error\n"%script_name)
                fh.write(report)
        else:
            print_stderr("'%s' script returned an error\n"%script_name)
            print_stderr(report)

        sdlog.error("SYDLOGON-040","Exception occured while retrieving certificate (status=%i)"%status)
        raise CertificateRenewalException("SYDLOGON-001","Cannot retrieve certificate from ESGF (hostname=%s,port=%s)"%(hostname,port))

    if debug:
        print_stderr("'%s' script stdxxx (debug mode)\n"%script_name)
        print_stderr(report)
def check_if_frozen(p, pid, local_path, previous_processes, new_processes):
    """Watchdog helper: terminate a wget whose output file stopped growing between polls."""
    process_key = "%s%s" % (pid, local_path)
    size_now = os.path.getsize(local_path)
    already_tracked = process_key in previous_processes

    if not already_tracked:
        # newly discovered wget: record its size for the next round
        new_processes[process_key] = size_now
    elif size_now == previous_processes[process_key]:
        # file did not grow since last poll: wget is considered stalled
        sdlog.error("SDWATCHD-275", "wget is stalled (%s,%s)" % (local_path, pid))
        p.terminate()
    else:
        # file grew: wget is making progress, nothing to do
        pass
def stop():
    """Stop the daemon, cleaning up an orphan pidfile when the process is already gone."""
    if not is_running():
        sdtools.print_stderr('Daemon is already stopped.')
        return

    pid = pidfile.read_pid()

    if psutil.pid_exists(pid):
        os.kill(pid, signal.SIGTERM)
        return

    # sdlog import must not be at the top of this file, because of double-fork
    import sdlog

    sdlog.error(
        'SDDAEMON-014',
        "Warning: daemon pidfile exists but daemon process doesn't exist. Most often, this is caused by an unexpected system restart (e.g. kernel panic)."
    )

    # remove orphan pidfile
    sdlog.info(
        'SDDAEMON-016',
        "Removing orphan daemon pidfile (%s)." % sdconfig.daemon_pid_file)
    os.unlink(sdconfig.daemon_pid_file)
def run(i__queries):
    """Execute search-API queries, retrying failed ones (retry mechanism).

    Args:
        i__queries: queries to execute; at this step their url must still
            contain the index-host placeholder (host not yet resolved).

    Returns:
        Metadata object aggregating the results of all successful queries.
    """
    # check: host must not be set at this step
    for query in i__queries:
        if sdconst.IDXHOSTMARK not in query['url']:
            raise sdexception.SDException('SDPROXMT-044','Incorrect query: host must not be set at this step')

    max_retry=6
    metadata=sdtypes.Metadata()
    pending=i__queries

    for attempt in range(max_retry):
        (success,errors)=run_helper(pending)
        metadata.slurp(success) # warning: success is modified here

        if not errors:
            if attempt>0:
                sdlog.info("SDPROXMT-089","retry succeeded")
            break

        sdlog.info("SDPROXMT-082","%d search-API queries failed"%(len(errors),))
        sdlog.info("SDPROXMT-083","retry 'failed search-API queries'")
        pending=errors # only the failed queries get retried

    if len(errors)>0:
        sdlog.error("SDPROXMT-084","max retry iteration reached. %d queries did not succeed"%(len(errors),))

    return metadata
def transfers_end():
    """Poll submitted Globus transfer tasks and finalize each transfer's status.

    NOTE(review): reads module-level globals: api_client, globus_username,
    globus_password, globus_tasks, incorrect_checksum_action — confirm they
    are defined at module scope.
    """
    _, _, access_token = api_client.goauth.get_access_token(
        username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(username=globus_username, goauth=access_token)

    for task_id in globus_tasks:
        # only the 'status' field of the Globus task is needed here
        code, reason, data = api.task(task_id, fields="status")
        status = data['status']

        sdlog.debug(
            "SDDMGLOB-016",
            "Checking the status of Globus transfer tasks, id: %s, status: %s"
            % (task_id, status))

        for item in globus_tasks[task_id]['items']:
            tr = item['tr']

            if status == "SUCCEEDED":

                assert tr.size is not None

                # a size mismatch is logged but not fatal by itself
                if int(tr.size) != os.path.getsize(tr.get_full_local_path()):
                    sdlog.error(
                        "SDDMGLOB-002",
                        "size don't match (remote_size=%i,local_size=%i,local_path=%s)"
                        % (int(tr.size),
                           os.path.getsize(tr.get_full_local_path()),
                           tr.get_full_local_path()))

                # retrieve local and remote checksum
                checksum_type = tr.checksum_type if tr.checksum_type is not None else sdconst.CHECKSUM_TYPE_MD5  # fallback to md5 when type unknown
                local_checksum = sdutils.compute_checksum(
                    tr.get_full_local_path(), checksum_type)
                remote_checksum = tr.checksum  # retrieve remote checksum

                if remote_checksum != None:
                    # remote checksum exists

                    # compare local and remote checksum
                    if remote_checksum == local_checksum:
                        # checksum is ok
                        tr.status = sdconst.TRANSFER_STATUS_DONE
                    else:
                        # checksum is not ok: behavior driven by 'incorrect_checksum_action' setting
                        if incorrect_checksum_action == "remove":
                            tr.status = sdconst.TRANSFER_STATUS_ERROR
                            tr.priority -= 1  # demote so other transfers get tried first
                            tr.error_msg = "File corruption detected: local checksum doesn't match remote checksum"

                            # remove file from local repository
                            sdlog.error(
                                "SDDMGLOB-155",
                                "checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)"
                                % (local_checksum, remote_checksum,
                                   tr.get_full_local_path()))
                            try:
                                os.remove(tr.get_full_local_path())
                            except Exception, e:
                                # best-effort removal: log and keep going
                                sdlog.error(
                                    "SDDMGLOB-158",
                                    "error occurs while removing local file (%s)"
                                    % tr.get_full_local_path())
                        elif incorrect_checksum_action == "keep":
                            sdlog.info(
                                "SDDMGLOB-157",
                                "local checksum doesn't match remote checksum (%s)" %
                                tr.get_full_local_path())
                            tr.status = sdconst.TRANSFER_STATUS_DONE
                        else:
                            raise FatalException(
                                "SDDMGLOB-507",
                                "incorrect value (%s)" % incorrect_checksum_action)
                else:
                    # remote checksum is missing
                    # NOTE: we DON'T store the local checksum ('file' table contains only the REMOTE checksum)
                    tr.status = sdconst.TRANSFER_STATUS_DONE

                if tr.status == sdconst.TRANSFER_STATUS_DONE:
                    tr.end_date = sdtime.now(
                    )  # WARNING: this is not the real end of transfer date but the date when we ask the globus scheduler if the transfer is done.
                    tr.error_msg = ""

                    sdlog.info("SDDMGLOB-101", "Transfer done (%s)" % str(tr))

            elif status == "FAILED":
                tr.status = sdconst.TRANSFER_STATUS_ERROR
                tr.priority -= 1
                tr.error_msg = "Error occurs during download."

                sdlog.info("SDDMGLOB-101", "Transfer failed (%s)" % str(tr))

                # Remove local file if exists
                if os.path.isfile(tr.get_full_local_path()):
                    try:
                        os.remove(tr.get_full_local_path())
                    except Exception, e:
                        # best-effort removal: log and keep going
                        sdlog.error(
                            "SDDMGLOB-528",
                            "Error occurs during file suppression (%s,%s)" %
                            (tr.get_full_local_path(), str(e)))
def transfers_begin(transfers):
    """Group transfers by source Globus endpoint and submit one Globus task per endpoint."""
    # Activate the destination endpoint
    _, _, access_token = api_client.goauth.get_access_token(
        username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(username=globus_username, goauth=access_token)
    activate_endpoint(api)

    # Divide all files that are to be transferred into groups based on the source globus endpoint
    globus_transfers = {}

    for tr in transfers:
        src_endpoint, src_path, path = map_to_globus(tr.url)
        local_path = tr.get_full_local_path()

        group = globus_transfers.setdefault(
            src_endpoint, {'src_endpoint': src_endpoint, 'items': []})
        group['items'].append({
            'src_path': src_path,
            'dst_path': local_path,
            'tr': tr
        })

        sdlog.info(
            "SDDMGLOB-001", "src_endpoint: %s, src_path: %s, local_path: %s" %
            (src_endpoint, src_path, local_path))

    # Submit one transfer task per source endpoint
    for src_endpoint in globus_transfers:

        # Activate the source endpoint
        activate_endpoint(api, src_endpoint)

        # Create a transfer and add files to the transfer
        code, message, data = api.transfer_submission_id()
        if code != 200:
            raise FatalException()
        submission_id = data['value']

        t = api_client.Transfer(submission_id, src_endpoint, dst_endpoint)
        sdlog.info(
            "SDDMGLOB-004",
            "Globus transfer, source endpoint: %s, destination endpoint: %s" %
            (src_endpoint, dst_endpoint))

        for item in globus_transfers[src_endpoint]['items']:
            t.add_item(item['src_path'], item['dst_path'])
            sdlog.info(
                "SDDMGLOB-005",
                "Globus transfer item, source path: %s, destination path: %s" %
                (item['src_path'], item['dst_path']))

        # Submit the transfer
        code, message, data = api.transfer(t)
        if code != 202:
            sdlog.error(
                "SDDMGLOB-006",
                "Error: Cannot add a transfer: (%s, %s)" % (code, message))
            raise FatalException()
        task_id = data['task_id']
        sdlog.info("SDDMGLOB-007", "Submitted Globus task, id: %s" % task_id)

        globus_tasks[task_id] = globus_transfers[src_endpoint]
def start_transfer_script(cls,tr):
    """Download one transfer synchronously and verify its size and checksum.

    NOTE(review): fragment — the function body continues in a later chunk of
    this file (the missing-remote-checksum branch, local file cleanup and the
    'killed' handling are not visible here).
    """
    # fake mode: flag the transfer as done without downloading anything
    if sdconfig.fake_download:
        tr.status=sdconst.TRANSFER_STATUS_DONE
        tr.error_msg=""
        tr.sdget_error_msg=""
        return

    # blocking call to the download script ('hpss' read from enclosing scope — TODO confirm)
    (tr.sdget_status,killed,tr.sdget_error_msg)=sdget.download(tr.url, tr.get_full_local_path(), debug=False, http_client=sdconst.HTTP_CLIENT_WGET, timeout=sdconst.ASYNC_DOWNLOAD_HTTP_TIMEOUT, verbosity=0, buffered=True, hpss=hpss)

    if tr.sdget_status==0:

        assert tr.size is not None

        # a size mismatch is logged but not fatal by itself
        if int(tr.size) != os.path.getsize(tr.get_full_local_path()):
            sdlog.error("SDDMDEFA-002","size don't match (remote_size=%i,local_size=%i,local_path=%s)"%(int(tr.size),os.path.getsize(tr.get_full_local_path()),tr.get_full_local_path()))

        # retrieve remote checksum
        remote_checksum=tr.checksum

        if remote_checksum!=None:
            # remote checksum exists

            # compute local checksum
            checksum_type=tr.checksum_type if tr.checksum_type is not None else sdconst.CHECKSUM_TYPE_MD5 # fallback to 'md5' (arbitrary)
            local_checksum=sdutils.compute_checksum(tr.get_full_local_path(),checksum_type)

            # compare local and remote checksum
            if remote_checksum==local_checksum:
                # checksum is ok
                tr.status=sdconst.TRANSFER_STATUS_DONE
                tr.error_msg=""
            else:
                # checksum is not ok: behavior driven by 'incorrect_checksum_action' setting
                if incorrect_checksum_action=="remove":
                    tr.status=sdconst.TRANSFER_STATUS_ERROR
                    tr.error_msg="File corruption detected: local checksum doesn't match remote checksum"

                    # remove file from local repository
                    sdlog.error("SDDMDEFA-155","checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)"%(local_checksum,remote_checksum,tr.get_full_local_path()))
                    try:
                        os.remove(tr.get_full_local_path())
                    except Exception,e:
                        # best-effort removal: log and keep going
                        sdlog.error("SDDMDEFA-158","error occurs while removing local file (%s)"%tr.get_full_local_path())
                elif incorrect_checksum_action=="keep":
                    sdlog.info("SDDMDEFA-157","local checksum doesn't match remote checksum (%s)"%tr.get_full_local_path())
                    tr.status=sdconst.TRANSFER_STATUS_DONE
                    tr.error_msg=""
                else:
                    raise sdexception.FatalException("SDDMDEFA-507","incorrect value (%s)"%incorrect_checksum_action)
    # NOTE(review): fragment — the enclosing 'def' is not visible in this
    # chunk; this looks like a variant of the scheduler main loop (cf.
    # event_loop() earlier in the file). Reads module-level 'quit' and
    # 'main_loop_sleep' — confirm at module scope.
    sdlog.info("SDTSCHED-902", "Transfer daemon is now up and running", stderr=True)

    while True:
        evlp0 = SDTimer.get_time()  # loop-start timestamp (not used in the visible part — TODO confirm use)

        # the daemon pidfile must exist for the whole lifetime of the loop
        assert os.path.isfile(sdconfig.daemon_pid_file)

        if quit == 0:
            run_soft_tasks()
            run_hard_tasks()

        # a fatal error in a download task stops the whole scheduler
        if sdtask.fatal_exception():
            sdlog.error("SDTSCHED-002", "Fatal exception occured during download", stderr=True)
            break

        if quit == 1:
            if can_leave(
            ):  # wait until all threads finish and until everything has been processed on the database I/O queue
                sdlog.info("SDTSCHED-001", "eot_queue orders processing completed", stderr=False)
                sdlog.info("SDTSCHED-003", "Running transfer processing completed", stderr=False)
                break

        time.sleep(main_loop_sleep)
                # NOTE(review): fragment — continuation of start_transfer_script()
                # (the enclosing 'def' and the preceding checksum logic appear in
                # an earlier chunk of this file); nesting reconstructed from that
                # chunk's structure.
                else:
                    # unknown 'incorrect_checksum_action' value: configuration error
                    raise sdexception.FatalException("SDDMDEFA-507","incorrect value (%s)"%incorrect_checksum_action)
        else:
            # remote checksum is missing
            # NOTE: we DON'T store the local checksum ('file' table contains only the *remote* checksum)
            tr.status=sdconst.TRANSFER_STATUS_DONE
            tr.error_msg=""
    else:
        # download failed: Remove file if exists
        if os.path.isfile(tr.get_full_local_path()):
            try:
                os.remove(tr.get_full_local_path())
            except Exception,e:
                # best-effort removal: log and keep going
                sdlog.error("SDDMDEFA-528","Error occurs during file suppression (%s,%s)"%(tr.get_full_local_path(),str(e)))

        # Set status
        if killed:
            # OLD WAY
            #tr.status=sdconst.TRANSFER_STATUS_WAITING
            #tr.error_msg="Error occurs during download (killed). Transfer marked for retry."

            # NEW WAY (TAG4JK4JJJ4454)
            #
            # We do not switch to 'waiting' anymore in this case, because
            # most often, process is killed by the watchdog for good
            # reason (e.g. the transfer process is frozen because of a
            # non-fixable server side problem).
            #
    - when the daemon is stopped, this retry is cancelled using SIGTERM (seems not working for now as it only stops on 'kill -9' TBC)
    """
    renew_certificate(openid,password,force_renew_certificate=force_renew_certificate)

def renew_certificate(openid,password,force_renew_certificate=False,force_renew_ca_certificates=False):
    """Renew ESGF certificate using sdmyproxy module.

    Args:
        openid: ESGF openid URL (hostname, port and username are extracted from it).
        password: password passed to the myproxy retrieval.
        force_renew_certificate: force renewal of the user certificate.
        force_renew_ca_certificates: force renewal of the CA certificates.

    Raises:
        Exception: re-raised after logging when openid parsing or myproxy
            retrieval fails.
    """

    # extract info from openid
    try:
        (hostname,port,username)=sdopenid.extract_info_from_openid(openid)
    except Exception,e:
        sdlog.error("SYDLOGON-800","Exception occured while processing openid (%s)"%str(e))
        raise

    try:
        sdmyproxy.run(hostname,port,username,force_renew_certificate,force_renew_ca_certificates,password)
    except Exception,e:
        sdlog.error("SYDLOGON-012","Error occured while retrieving certificate from myproxy server (%s)"%str(e))
        raise

# init.

if __name__ == '__main__':
    # standalone mode: force-renew the certificate using the configured credentials
    parser = argparse.ArgumentParser()
    args = parser.parse_args()

    renew_certificate(sdconfig.openid,sdconfig.password,force_renew_certificate=True)

    print_stderr("Certificate successfully renewed")