def _poll_transfers(external_host, transfers_by_eid, vo, timeout, logger):
    """
    Helper function for poll_transfers which performs the actual polling and database update.

    :param external_host: The FTS server to query.
    :param transfers_by_eid: Dict mapping external (FTS job) id to the list of transfers in that job.
    :param vo: The VO to poll for.
    :param timeout: Timeout passed to the bulk query.
    :param logger: Callable taking (level, message, ...) used for all logging.
    """
    # More than one job in the batch: on a malformed bulk answer we re-raise so the
    # caller can retry the jobs one-by-one and isolate the offending one.
    is_bulk = len(transfers_by_eid) > 1
    try:
        tss = time.time()
        logger(logging.INFO, 'Polling %i transfers against %s with timeout %s' % (len(transfers_by_eid), external_host, timeout))
        resps = transfer_core.bulk_query_transfers(external_host, transfers_by_eid, TRANSFER_TOOL, vo, timeout, logger)
        duration = time.time() - tss
        # Timer is normalised to milliseconds per transfer.
        record_timer('daemons.conveyor.poller.bulk_query_transfers', duration * 1000 / len(transfers_by_eid))
        logger(logging.DEBUG, 'Polled %s transfer requests status in %s seconds' % (len(transfers_by_eid), duration))
    except TransferToolTimeout as error:
        logger(logging.ERROR, str(error))
        return
    except TransferToolWrongAnswer as error:
        logger(logging.ERROR, str(error))
        if is_bulk:
            raise  # The calling context will retry transfers one-by-one
        else:
            return
    except RequestException as error:
        logger(logging.ERROR, "Failed to contact FTS server: %s" % (str(error)))
        return
    except Exception:
        logger(logging.ERROR, "Failed to query FTS info", exc_info=True)
        return

    tss = time.time()
    logger(logging.DEBUG, 'Updating %s transfer requests status' % (len(transfers_by_eid)))
    cnt = 0

    # Only requests we actually submitted are updated; anything else in the FTS
    # response (e.g. multihop intermediates) is ignored via this intersection.
    request_ids = {
        t['request_id']
        for t in itertools.chain.from_iterable(transfers_by_eid.values())
    }
    for transfer_id in resps:
        try:
            transf_resp = resps[transfer_id]
            # transf_resp is None: Lost.
            #             is Exception: Failed to get fts job status.
            #             is {}: No terminated jobs.
            #             is {request_id: {file_status}}: terminated jobs.
            if transf_resp is None:
                # FTS no longer knows the job: synthesise a LOST response per request.
                for transfer in transfers_by_eid[transfer_id]:
                    resp = transfer_core.fake_transfertool_response(
                        transfer,
                        new_state=RequestState.LOST,
                        reason="The FTS job lost")
                    request_core.update_request_state(resp, logger=logger)
                record_counter('daemons.conveyor.poller.transfer_lost')
            elif isinstance(transf_resp, Exception):
                logger(logging.WARNING, "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                record_counter('daemons.conveyor.poller.query_transfer_exception')
            else:
                for request_id in request_ids.intersection(transf_resp):
                    ret = request_core.update_request_state(transf_resp[request_id], logger=logger)
                    # if True, really update request content; if False, only touch request
                    if ret:
                        cnt += 1
                    record_counter('daemons.conveyor.poller.update_request_state.{updated}', labels={'updated': ret})

            # should touch transfers.
            # Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be poll again.
            transfer_core.touch_transfer(external_host, transfer_id)
        except (DatabaseException, DatabaseError) as error:
            # Row-lock/deadlock errors (Oracle ORA-00054/ORA-00060, MySQL 1205) are
            # transient: skip this job, it will be picked up on the next poll cycle.
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logger(logging.WARNING, "Lock detected when handling request %s - skipping" % transfer_id)
            else:
                logger(logging.ERROR, 'Exception', exc_info=True)
    logger(logging.DEBUG, 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds' % (len(transfers_by_eid), cnt, (time.time() - tss)))
def poll_transfers(external_host, xfers, prepend_str='', request_ids=None, timeout=None):
    """
    Poll a list of transfers from an FTS server and update their database state.

    :param external_host: The FTS server to query from.
    :param xfers: List of transfer (FTS job) ids to poll.
    :param prepend_str: String prepended to every log message.
    :param request_ids: Container of request ids whose state may be updated;
                        requests not in this container are ignored.
    :param timeout: Timeout for the bulk query.
    """
    try:
        if TRANSFER_TOOL == 'mock':
            # The mock transfertool performs no real transfers: mark everything DONE.
            logging.debug(prepend_str + 'Setting %s transfer requests status to DONE per mock tool' % (len(xfers)))
            for task_id in xfers:
                ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=RequestState.DONE)
                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)
            return
        try:
            tss = time.time()
            logging.info(prepend_str + 'Polling %i transfers against %s with timeout %s' % (len(xfers), external_host, timeout))
            resps = transfer_core.bulk_query_transfers(external_host, xfers, TRANSFER_TOOL, timeout)
            # Timer is normalised to milliseconds per transfer.
            record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers))
        except TransferToolTimeout as error:
            logging.error(prepend_str + str(error))
            return
        except TransferToolWrongAnswer as error:
            # The bulk answer was unusable: probe each job individually so the
            # offending one can be identified in the logs.
            logging.error(prepend_str + str(error))
            logging.error(prepend_str + 'Problem querying %s on %s. All jobs are being checked individually' % (str(xfers), external_host))
            for xfer in xfers:
                try:
                    logging.debug(prepend_str + 'Checking %s on %s' % (xfer, external_host))
                    status = transfer_core.bulk_query_transfers(external_host, [xfer, ], TRANSFER_TOOL, timeout)
                    if xfer in status and isinstance(status[xfer], Exception):
                        logging.error(prepend_str + 'Problem querying %s on %s . Error returned : %s' % (xfer, external_host, str(status[xfer])))
                except Exception as err:
                    logging.error(prepend_str + 'Problem querying %s on %s . Error returned : %s' % (xfer, external_host, str(err)))
                    break
            return
        except RequestException as error:
            logging.error(prepend_str + "Failed to contact FTS server: %s" % (str(error)))
            return
        except Exception:
            logging.error(prepend_str + "Failed to query FTS info: %s" % (traceback.format_exc()))
            return
        logging.debug(prepend_str + 'Polled %s transfer requests status in %s seconds' % (len(xfers), (time.time() - tss)))

        tss = time.time()
        logging.debug(prepend_str + 'Updating %s transfer requests status' % (len(xfers)))
        cnt = 0
        if TRANSFER_TOOL == 'globus':
            # Globus responses map task ids directly to a terminal state.
            for task_id in resps:
                ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=resps[task_id])
                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)
        else:
            for transfer_id in resps:
                try:
                    transf_resp = resps[transfer_id]
                    # transf_resp is None: Lost.
                    #             is Exception: Failed to get fts job status.
                    #             is {}: No terminated jobs.
                    #             is {request_id: {file_status}}: terminated jobs.
                    if transf_resp is None:
                        transfer_core.update_transfer_state(external_host, transfer_id, RequestState.LOST, logging_prepend_str=prepend_str)
                        record_counter('daemons.conveyor.poller.transfer_lost')
                    elif isinstance(transf_resp, Exception):
                        logging.warning(prepend_str + "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                        record_counter('daemons.conveyor.poller.query_transfer_exception')
                    else:
                        for request_id in transf_resp:
                            # Guard against request_ids=None (the parameter default);
                            # previously this raised TypeError and aborted the loop.
                            if request_ids and request_id in request_ids:
                                ret = request_core.update_request_state(transf_resp[request_id], logging_prepend_str=prepend_str)
                                # if True, really update request content; if False, only touch request
                                if ret:
                                    cnt += 1
                                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)

                    # should touch transfers.
                    # Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be poll again.
                    transfer_core.touch_transfer(external_host, transfer_id)
                except (DatabaseException, DatabaseError) as error:
                    # Transient row-lock/deadlock errors: skip, next cycle retries.
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        # Use transfer_id: request_id may be unbound if the error
                        # occurred before the per-request loop ran (e.g. LOST update).
                        logging.warning(prepend_str + "Lock detected when handling request %s - skipping" % transfer_id)
                    else:
                        logging.error(traceback.format_exc())
        logging.debug(prepend_str + 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds' % (len(xfers), cnt, (time.time() - tss)))
    except Exception:
        logging.error(traceback.format_exc())
def poll_transfers(external_host, xfers, prepend_str='', request_ids=None, timeout=None):
    """
    Poll a list of transfers from an FTS server and update their database state.

    :param external_host: The FTS server to query from.
    :param xfers: List of transfer (FTS job) ids to poll.
    :param prepend_str: String prepended to every log message.
    :param request_ids: Container of request ids whose state may be updated;
                        requests not in this container are ignored.
    :param timeout: Timeout for the bulk query.
    """
    try:
        try:
            tss = time.time()
            logging.info(prepend_str + 'Polling %i transfers against %s with timeout %s' % (len(xfers), external_host, timeout))
            resps = transfer_core.bulk_query_transfers(external_host, xfers, 'fts3', timeout)
            # Timer is normalised to milliseconds per transfer.
            record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers))
        except RequestException as error:
            logging.error(prepend_str + "Failed to contact FTS server: %s" % (str(error)))
            return
        except Exception:
            logging.error(prepend_str + "Failed to query FTS info: %s" % (traceback.format_exc()))
            return
        logging.debug(prepend_str + 'Polled %s transfer requests status in %s seconds' % (len(xfers), (time.time() - tss)))

        tss = time.time()
        logging.debug(prepend_str + 'Updating %s transfer requests status' % (len(xfers)))
        cnt = 0
        for transfer_id in resps:
            try:
                transf_resp = resps[transfer_id]
                # transf_resp is None: Lost.
                #             is Exception: Failed to get fts job status.
                #             is {}: No terminated jobs.
                #             is {request_id: {file_status}}: terminated jobs.
                if transf_resp is None:
                    transfer_core.update_transfer_state(external_host, transfer_id, RequestState.LOST, logging_prepend_str=prepend_str)
                    record_counter('daemons.conveyor.poller.transfer_lost')
                elif isinstance(transf_resp, Exception):
                    logging.warning(prepend_str + "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                    record_counter('daemons.conveyor.poller.query_transfer_exception')
                else:
                    for request_id in transf_resp:
                        # Guard against request_ids=None (the parameter default);
                        # previously this raised TypeError and aborted the loop.
                        if request_ids and request_id in request_ids:
                            ret = request_core.update_request_state(transf_resp[request_id], logging_prepend_str=prepend_str)
                            # if True, really update request content; if False, only touch request
                            if ret:
                                cnt += 1
                            record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)

                # should touch transfers.
                # Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be poll again.
                transfer_core.touch_transfer(external_host, transfer_id)
            except (DatabaseException, DatabaseError) as error:
                # Transient row-lock/deadlock errors: skip, next cycle retries.
                if isinstance(error.args[0], tuple) and (match('.*ORA-00054.*', error.args[0][0])
                                                         or match('.*ORA-00060.*', error.args[0][0])
                                                         or ('ERROR 1205 (HY000)' in error.args[0][0])):
                    # Use transfer_id: request_id may be unbound if the error
                    # occurred before the per-request loop ran (e.g. LOST update).
                    logging.warning(prepend_str + "Lock detected when handling request %s - skipping" % transfer_id)
                else:
                    logging.error(traceback.format_exc())
        logging.debug(prepend_str + 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds' % (len(xfers), cnt, (time.time() - tss)))
    except Exception:
        logging.error(traceback.format_exc())