Пример #1
0
 def _getDuration(self, start, end):
     """ Returns the duration in (fractional) seconds between start and end.

         start and end are parsed via self._getDateTime. If start is not
         strictly before end, a warning is logged and 1.0 is returned as a
         safe non-zero fallback (callers divide by this duration).
     """
     startTime = self._getDateTime(start)
     endTime = self._getDateTime(end)
     if startTime >= endTime:
         logger.warning("startTime is after endTime")
         return 1.0  # non-zero float fallback to prevent divide-by-zero in callers
     return totalSeconds(endTime - startTime)
Пример #2
0
 def _getDuration(self, start, end):
     """ Compute the number of seconds between start and end. """
     begin = self._getDateTime(start)
     finish = self._getDateTime(end)
     if begin < finish:
         return totalSeconds(finish - begin)
     # start is not strictly before end: warn and fall back to a non-zero value
     logger.warning("startTime is after endTime")
     return 1  ##TODO To prevent divide by zero later
Пример #3
0
    def _resubmit_message_if_applicable(self, msg: EventMessage) -> bool:
        """Resubmit the given message to the back of the queue when it has not
        yet exceeded its maximum number of retries.

        Returns True when the message was resubmitted, False otherwise.
        """
        if not msg or not msg.content:
            return False

        nr_of_retries = msg.content.get('retry_count', 0)
        if nr_of_retries >= min(MAX_NR_OF_RETRIES,
                                127):  # mom can't handle more than 127 status updates...
            return False

        nr_of_retries += 1
        try:
            previous_timestamp = msg.content.get('retry_timestamp',
                                                 datetime.utcnow())
            seconds_ago = totalSeconds(datetime.utcnow() - previous_timestamp)
            # wait longer between each retry, other messages can be processed in between
            time.sleep(max(0, 2 * nr_of_retries - seconds_ago))
        except Exception as e:
            # just continue
            logger.warning(e)

        # set/update retry values
        msg.content['retry_count'] = nr_of_retries
        msg.content['retry_timestamp'] = datetime.utcnow()

        # resubmit
        resubmit_msg = EventMessage(subject=msg.subject, content=msg.content)
        resubmit_msg.ttl = 48 * 3600  # remove message from queue's when not picked up within 48 hours
        logger.info('resubmitting unhandled message to back of queue %s: %s %s',
                    self.exchange, resubmit_msg.subject,
                    str(resubmit_msg.content).replace('\n', ' '))
        self.send(resubmit_msg)
        return True
Пример #4
0
 def _getDuration(self, start, end):
     """ Returns number of fractional seconds as a float(!) (as totalSeconds())
         between start and end.
     """
     begin = parseDatetime(start)
     finish = parseDatetime(end)
     if begin < finish:
         return totalSeconds(finish - begin)
     # not a positive duration: warn and return a non-zero float fallback
     logger.warning("startTime is not before endTime")
     return 1.0 ##TODO To prevent divide by zero later
Пример #5
0
    def executeQuery(self, query, qargs=None, fetch=FETCH_NONE):
        '''execute the query and reconnect upon OperationalError

        :param query: sql query string, may contain %s placeholders for qargs
        :param qargs: optional sequence of arguments for the placeholders
        :param fetch: FETCH_NONE, FETCH_ONE or FETCH_ALL, determines what is fetched/returned
        :return: the fetched row (FETCH_ONE) or rows (FETCH_ALL), otherwise []
        '''
        try:
            with self._connection.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cursor:
                start = datetime.utcnow()
                cursor.execute(query, qargs)
                if self._log_queries:
                    elapsed = datetime.utcnow() - start
                    elapsed_ms = 1000.0 * totalSeconds(elapsed)
                    logger.info('executed query in %.1fms%s yielding %s rows: %s', elapsed_ms,
                                                                                   ' (SLOW!)' if elapsed_ms > 250 else '', # for easy log grep'ing
                                                                                   cursor.rowcount,
                                                                                   self._queryAsSingleLine(query, qargs))

                try:
                    self._log_database_notifications()

                    result = []
                    if fetch == FETCH_ONE:
                        result = cursor.fetchone()

                    if fetch == FETCH_ALL:
                        result = cursor.fetchall()

                    if self.__auto_commit_selects and re.search('select', query, re.IGNORECASE):
                        #prevent dangling in idle transaction on server
                        self.commit()

                    return result
                except Exception as e:
                    logger.error("error while fetching result(s) for %s: %s", self._queryAsSingleLine(query, qargs), e)

        except (psycopg2.OperationalError, AttributeError) as e:
            logger.error(str(e))
            while self.__connection_retries < 5:
                logger.info("(re)trying to connect to database")
                self.__connection_retries += 1
                self._connect()
                if self._connection:
                    self.__connection_retries = 0
                    return self.executeQuery(query, qargs, fetch)
                # back off quadratically with the number of failed retries
                # (bugfix: original slept on undefined name 'i' -> NameError)
                time.sleep(self.__connection_retries * self.__connection_retries)
        except (psycopg2.IntegrityError, psycopg2.ProgrammingError, psycopg2.InternalError, psycopg2.DataError) as e:
            logger.error("Rolling back query=\'%s\' due to error: \'%s\'" % (self._queryAsSingleLine(query, qargs), e))
            self.rollback()
            return []
        except Exception as e:
            logger.error(str(e))

        return []
Пример #6
0
    def _do_execute_query(self, query, qargs=None, fetch=FETCH_NONE):
        '''execute the query and reconnect upon OperationalError

        :param query: sql query string, may contain %s placeholders for qargs
        :param qargs: optional sequence of arguments for the placeholders
        :param fetch: FETCH_NONE, FETCH_ONE or FETCH_ALL
        :return: a plain dict (or None) for FETCH_ONE, a list of plain dicts
                 for FETCH_ALL, and [] otherwise
        :raises PostgresDBConnectionError: when the OperationalError is a
                recoverable connection problem (so callers can reconnect/retry)
        '''
        query_log_line = self._queryAsSingleLine(query, qargs)

        try:
            self.connect_if_needed()

            # log
            logger.debug('executing query: %s', query_log_line)

            # execute (and time it)
            start = datetime.utcnow()
            self._cursor.execute(query, qargs)
            elapsed = datetime.utcnow() - start
            elapsed_ms = 1000.0 * totalSeconds(elapsed)

            # log execution result
            logger.info('executed query in %.1fms%s yielding %s rows: %s', elapsed_ms,
                                                                           ' (SLOW!)' if elapsed_ms > 250 else '', # for easy log grep'ing
                                                                           self._cursor.rowcount,
                                                                           query_log_line)

            # log any notifications from within the database itself
            self._log_database_notifications()

            self._commit_selects_if_needed(query)

            # fetch and return results
            # (convert rows to plain dicts so callers don't depend on the cursor's row type)
            if fetch == FETCH_ONE:
                row = self._cursor.fetchone()
                return dict(row) if row is not None else None
            if fetch == FETCH_ALL:
                return [dict(row) for row in self._cursor.fetchall() if row is not None]
            return []

        except psycopg2.OperationalError as oe:
            if self._is_recoverable_connection_error(oe):
                # translate into a domain error so callers can reconnect and retry
                raise PostgresDBConnectionError("Could not execute query due to connection errors. '%s' error=%s" %
                                                (query_log_line,
                                                 single_line_with_single_spaces(oe)))
            else:
                self._log_error_rollback_and_raise(oe, query_log_line)

        except Exception as e:
            # NOTE(review): _log_error_rollback_and_raise presumably re-raises — confirm;
            # otherwise this method would fall through and return None here.
            self._log_error_rollback_and_raise(e, query_log_line)
Пример #7
0
    def test_resource_usages_performance(self):
        """Performance regression test: schedule many tasks and resource claims
        and assert that inserting claims, updating claim/task statuses and
        (cascading) deletion of specifications each finish within a time
        threshold. Timing results are also written to a csv file for offline
        analysis.
        """
        ELAPSED_TRESHOLD = 2.0  #max allowed insert/update/delete time in seconds

        num_resources = self.radb.executeQuery(
            'select count(id) from virtual_instrument.resource;',
            fetch=FETCH_ONE)['count']
        # make sure all resources have 1000 units available
        MAX_CAPACITY = 1000
        self.radb.executeQuery(
            'update resource_monitoring.resource_capacity set (available, total) = (%s, %s);',
            (MAX_CAPACITY, MAX_CAPACITY))

        # pretend that we have an almost unlimited amount of storage space
        self.radb.executeQuery('update resource_monitoring.resource_capacity set (available, total) = (%s, %s) ' \
                                'where resource_id in (select id from virtual_instrument.resource_view where type_name = \'storage\');',
                                (1e9*MAX_CAPACITY,1e9*MAX_CAPACITY))

        # keep a list of storage-type resource(ids), so we can create long lasting claims for these.
        storage_resource_ids = set(
            r['id'] for r in self.radb.getResources(resource_types='storage'))

        now = datetime.utcnow()
        now -= timedelta(minutes=now.minute,
                         seconds=now.second,
                         microseconds=now.microsecond)  # round to full hour
        spec_ids = []
        filename = 'resource_usages_performance%s.csv' % (
            datetime.utcnow().strftime('%Y%m%dT%H%M%S'), )
        with open(filename, 'w') as file:
            file.write(
                '#tasks, #claims, #claims_per_resource, #inserted_claims, elapsed_insert\n'
            )
            counter = 0
            # it is not common to claim a single resource multiple times for the same task, but it can happen, so test for it.
            for preferred_num_claims_per_resource in [1, 2, 5, 10, 20, 50]:
                # let's test over a feasible range of #claims. A lofar observation usually has ~200 claims.
                for num_claims_to_insert in [
                        1, 2, 5, 10, 20, 50, 100, 200, 500
                ]:
                    num_claims_to_insert = min(
                        num_claims_to_insert,
                        preferred_num_claims_per_resource * num_resources)
                    num_claims_per_resource = min(
                        preferred_num_claims_per_resource,
                        num_claims_to_insert)

                    for oversubscription_factor in [1, 999]:
                        counter += 1

                        logger.info(
                            '*****************************************************************'
                        )
                        logger.info(
                            'starting task and claim scheduling: counter=%s num_claims_per_resource=%s num_claims_to_insert=%s oversubscription_factor=%s',
                            counter, num_claims_per_resource,
                            num_claims_to_insert, oversubscription_factor)

                        result = self.radb.insertOrUpdateSpecificationAndTask(
                            counter, counter, 'approved', 'observation',
                            now + timedelta(hours=3 * counter),
                            now + timedelta(hours=3 * counter + 1), 'content',
                            'CEP4')
                        task_id = result['task_id']
                        task = self.radb.getTask(task_id)
                        spec_ids.append(task['specification_id'])

                        # NOTE(review): 'q / num_claims_per_resource' is true division under
                        # python3 and yields a float resource_id — presumably integer
                        # division ('//') was intended; confirm against the radb API.
                        claims = [{
                            'resource_id':
                            q / num_claims_per_resource,
                            'starttime':
                            task['starttime'],
                            'endtime':
                            task['endtime'],
                            'status':
                            'tentative',
                            'claim_size':
                            oversubscription_factor * MAX_CAPACITY /
                            num_claims_per_resource
                        } for q in range(num_claims_to_insert)]

                        # extend claims on storage resources
                        for claim in claims:
                            if claim['resource_id'] in storage_resource_ids:
                                claim['endtime'] += timedelta(days=100)

                        start = datetime.utcnow()
                        self.radb.insertResourceClaims(task_id, claims, 'foo',
                                                       1, 1)
                        elapsed_insert = totalSeconds(datetime.utcnow() -
                                                      start)

                        num_tasks = self.radb.executeQuery(
                            'select count(id) from resource_allocation.task;',
                            fetch=FETCH_ONE)['count']
                        num_claims = self.radb.executeQuery(
                            'select count(id) from resource_allocation.resource_claim;',
                            fetch=FETCH_ONE)['count']
                        has_storage_claims = len(
                            self.radb.getResourceClaims(
                                task_ids=task_id, resource_type='storage')) > 0

                        # enforce perfomance criterion: inserting claims should take less than ELAPSED_TRESHOLD sec
                        self.assertLess(
                            elapsed_insert,
                            ELAPSED_TRESHOLD,
                            msg=
                            "insertResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s"
                            % (elapsed_insert, ELAPSED_TRESHOLD, num_tasks,
                               num_claims, num_claims_to_insert,
                               num_claims_per_resource))

                        if oversubscription_factor > 1:
                            # (deliberate) oversubscription of resources
                            # so, expect the claims and task to be in conflict
                            self.assertGreater(
                                len(
                                    self.radb.getResourceClaims(
                                        task_ids=task_id, status='conflict')),
                                0)
                            self.assertEqual(
                                'conflict',
                                self.radb.getTask(task_id)['status'])

                            # solve oversubscription
                            start = datetime.utcnow()
                            self.radb.updateResourceClaims(
                                where_task_ids=task_id,
                                claim_size=MAX_CAPACITY /
                                num_claims_per_resource)
                            elapsed_status_update = totalSeconds(
                                datetime.utcnow() - start)

                            # enforce perfomance criterion: updating claims should take less than ELAPSED_TRESHOLD sec
                            self.assertLess(
                                elapsed_status_update,
                                ELAPSED_TRESHOLD,
                                msg=
                                "updateResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s"
                                % (elapsed_status_update, ELAPSED_TRESHOLD,
                                   num_tasks, num_claims, num_claims_to_insert,
                                   num_claims_per_resource))

                            # check if not oversubscribed anymore
                            self.assertEqual(
                                0,
                                len(
                                    self.radb.getResourceClaims(
                                        task_ids=task_id, status='conflict')))
                            self.assertEqual(
                                'approved',
                                self.radb.getTask(task_id)['status'])

                        # no oversubscription (anymore), so expect all claims to be claimable...
                        start = datetime.utcnow()
                        self.radb.updateTaskAndResourceClaims(
                            task_id=task_id, claim_status='claimed')
                        elapsed_status_update = totalSeconds(
                            datetime.utcnow() - start)

                        # are they indeed claimed?
                        self.assertEqual(
                            num_claims_to_insert,
                            len(
                                self.radb.getResourceClaims(task_ids=task_id,
                                                            status='claimed')))

                        # enforce perfomance criterion: updating claims should take less than 2*ELAPSED_TRESHOLD sec (2* because we update both tasks and claims)
                        self.assertLess(
                            elapsed_status_update,
                            2 * ELAPSED_TRESHOLD,
                            msg=
                            "updateTaskAndResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s"
                            % (elapsed_status_update, ELAPSED_TRESHOLD,
                               num_tasks, num_claims, num_claims_to_insert,
                               num_claims_per_resource))

                        # ... and proceed with cycling through the task status
                        for task_status in [
                                'prescheduled', 'scheduled', 'queued',
                                'active', 'completing', 'finished'
                        ]:
                            # update the task status
                            start = datetime.utcnow()
                            self.radb.updateTaskAndResourceClaims(
                                task_id=task_id, task_status=task_status)
                            elapsed_status_update = totalSeconds(
                                datetime.utcnow() - start)

                            # enforce perfomance criterion: updating task status should take less than  2*ELAPSED_TRESHOLD sec (2* because we update both tasks and claims)
                            self.assertLess(
                                elapsed_status_update,
                                2 * ELAPSED_TRESHOLD,
                                msg=
                                "updateTaskAndResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s task_status=%s"
                                % (elapsed_status_update, ELAPSED_TRESHOLD,
                                   num_tasks, num_claims, num_claims_to_insert,
                                   num_claims_per_resource, task_status))

                            # check task status
                            self.assertEqual(
                                task_status,
                                self.radb.getTask(task_id)['status'])

                        # task should now be finished
                        self.assertEqual('finished',
                                         self.radb.getTask(task_id)['status'])
                        # and all non-long-lasting (storage) claims should be removed.
                        self.assertEqual(
                            0,
                            len(
                                list(c for c in self.radb.getResourceClaims(
                                    task_ids=task_id)
                                     if c['endtime'] <= task['endtime'])))

                        if has_storage_claims:
                            # and all long-lasting (storage) claims should still be there.
                            # (they are removed by  the cleanupservice ending/removing the storage claims)
                            self.assertGreater(
                                len(
                                    list(c
                                         for c in self.radb.getResourceClaims(
                                             task_ids=task_id)
                                         if c['endtime'] > task['endtime'])),
                                0)

                        logger.info(
                            'TEST RESULT: radb now contains %d tasks, %d claims, insert of %d claims with %d claims per resource takes on average %.3fsec',
                            num_tasks, num_claims, num_claims_to_insert,
                            num_claims_per_resource, elapsed_insert)
                        file.write(
                            '%d, %d, %d, %d, %.3f\n' %
                            (num_tasks, num_claims, num_claims_per_resource,
                             num_claims_to_insert, elapsed_insert))
                        file.flush()

            logger.info('removing all test specs/tasks/claims from db')
            delete_elapsed_list = []

            file.write('\n\n#tasks, #claims, elapsed_delete\n')

            for spec_id in spec_ids:
                num_tasks = self.radb.executeQuery(
                    'select count(id) from resource_allocation.task;',
                    fetch=FETCH_ONE)['count']
                num_claims = self.radb.executeQuery(
                    'select count(id) from resource_allocation.resource_claim;',
                    fetch=FETCH_ONE)['count']
                start = datetime.utcnow()
                self.radb.deleteSpecification(spec_id)
                elapsed = totalSeconds(datetime.utcnow() - start)
                delete_elapsed_list.append(elapsed)

                # enforce perfomance criterion: (cascading) delete of spec should take less than ELAPSED_TRESHOLD sec
                self.assertLess(elapsed, ELAPSED_TRESHOLD)

                file.write('%d, %d, %.3f\n' % (num_tasks, num_claims, elapsed))
                file.flush()

        logger.info('average spec delete time: %.3f',
                    sum(delete_elapsed_list) / float(len(delete_elapsed_list)))
        logger.info('Done. Results can be found in file: %s', filename)
Пример #8
0
def main():
    """Run a db performance test: insert many files into an empty test
    database and report the elapsed time per insert batch to stdout and
    to db_perf.csv."""
    from optparse import OptionParser
    from lofar.common import dbcredentials

    # parse the invocation arguments (none are defined, but -h/--help works)
    parser = OptionParser(
        "%prog [options]",
        description=
        'execute a performance test by inserting many files on an empty test database.'
    )
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    with LTAStorageDbTestInstance() as test_db:
        base_date = datetime.utcnow()

        db = store.LTAStorageDb(test_db.dbcreds)

        # set up one site with a root dir and a projects dir to insert files under
        db.insertSiteIfNotExists('sara', 'srm://srm.siteA.nl:8444')
        rootdir_id = db.insertRootDirectory(
            'sara', '/pnfs/grid.siteA.nl/data/lofar/ops')
        projects_dir_id = db.insertSubDirectory(
            '/pnfs/grid.siteA.nl/data/lofar/ops/projects', rootdir_id)

        num_files_so_far = 0

        with open('db_perf.csv', 'w') as stats_file:
            for cycle_nr in range(1, 10):
                for project_nr in range(1, 10):
                    project_name = 'lc%d_%03d' % (cycle_nr, project_nr)
                    projectdir_id = db.insertSubDirectory(
                        '/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s' %
                        (project_name, ), projects_dir_id)

                    obs_base_id = cycle_nr * 100000 + project_nr * 1000
                    for obs_nr, obsId in enumerate(
                            range(obs_base_id, obs_base_id + 20)):
                        obsName = 'L%s' % obsId

                        obsdir_id = db.insertSubDirectory(
                            '/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s/%s'
                            % (project_name, obsName), projectdir_id)

                        # build (name, size, creation_date, dir_id) tuples
                        # for two subband files of this observation
                        fileinfos = []
                        for sbNr in range(0, 2):
                            creation_date = base_date + timedelta(
                                days=10 * cycle_nr + project_nr,
                                minutes=obs_nr,
                                seconds=sbNr)
                            fileinfos.append(
                                ('%s_SB%3d' % (obsName, sbNr),
                                 1000 + sbNr + project_nr * cycle_nr,
                                 creation_date, obsdir_id))

                        # time the actual insert
                        now = datetime.utcnow()
                        file_ids = db.insertFileInfos(fileinfos)
                        num_files_so_far += len(file_ids)
                        elapsed = totalSeconds(datetime.utcnow() - now)

                        line = '%s,%s' % (num_files_so_far, elapsed)
                        print(line)
                        stats_file.write(line + '\n')
Пример #9
0
    def enoughResourcesAvailable(self):
        """Check whether system resources allow starting a new (transfer) job.

        Checks, in order: network bandwidth usage, idle cpu percentage,
        short-term load average, swap usage, number of processes of the
        current user, and the number of currently running/finalizing jobs
        versus self.max_nr_of_parallel_jobs.

        :return: True when a new job may be started, False otherwise.
                 On unexpected errors, falls back to allowing one more job
                 only when at most 4 jobs are running.
        """
        try:
            now = datetime.utcnow()
            bytes_sent = _getBytesSent()

            if bytes_sent >= self.__prev_bytes_sent:  # bytes_sent might wrap around zero
                # compute current speed in Gbps
                speed = 8 * (bytes_sent - self.__prev_bytes_sent
                             ) / totalSeconds(now -
                                              self.__prev_bytes_sent_timestamp)

                # running average for used_bandwidth
                used_bandwidth = 0.5 * speed + 0.5 * self.__prev_used_bandwidth

                logger.debug("resources: current used_bandwidth = %s",
                             humanreadablesize(used_bandwidth, 'bps'))

                # store for next iteration
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now
                self.__prev_used_bandwidth = used_bandwidth

                # only start new jobs if we have some bandwith available
                # note that this is a 'soft' limit.
                # we cannot control the actual bandwith used by the running transfers
                # we can only not start new jobs if we already exceed the MAX_USED_BANDWITH_TO_START_NEW_JOBS
                if used_bandwidth > MAX_USED_BANDWITH_TO_START_NEW_JOBS:
                    logger.warning(
                        'resources: not enough bandwith available to start new jobs, using %s, max %s'
                        % (humanreadablesize(used_bandwidth, 'bps'),
                           humanreadablesize(
                               MAX_USED_BANDWITH_TO_START_NEW_JOBS, 'bps')))
                    return False
            else:
                # wrapped around 0, just store for next iteration, do not compute anything
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now

            # only start new jobs if we have some cpu time available
            idle_cpu_percentage = psutil.cpu_times_percent().idle
            logger.debug("resources: current idle_cpu_percentage = %s%%",
                         idle_cpu_percentage)
            if idle_cpu_percentage < 5:
                logger.warning(
                    'resources: not enough cpu power available to start new jobs, cpu_idle %s%%'
                    % idle_cpu_percentage)
                return False

            # only start new jobs if system load is not too high
            short_load_avg = os.getloadavg()[0]
            cpu_count = psutil.cpu_count()
            allowed_load = 1.5 * cpu_count
            logger.debug(
                "resources: current short term load = %s #cpu's = %s allowed_load = %s",
                short_load_avg, cpu_count, allowed_load)
            if short_load_avg > allowed_load:
                logger.warning(
                    'resources: system load too high (%s > %s), cannot start new jobs'
                    % (short_load_avg, allowed_load))
                return False

            # only allow 1 job at the time if swapping
            swap_memory_percentage = psutil.swap_memory().percent
            logger.debug("resources: current swap_memory_percentage = %s%%",
                         swap_memory_percentage)
            if swap_memory_percentage > 5 and len(self.__running_jobs) > 0:
                logger.warning(
                    'resources: system swapping. not enough memory available to start new jobs'
                )
                return False

            # only start new jobs if number of processes is not too high
            try:
                current_user = getpass.getuser()
                current_user_procs = [
                    p for p in psutil.process_iter()
                    if p.username() == current_user
                ]
                current_num_user_procs = len(current_user_procs)
                allowed_num_user_procs = 64 * cpu_count

                logger.debug(
                    "resources: current num_user_procs = %s allowed_num_user_procs = %s",
                    current_num_user_procs, allowed_num_user_procs)

                if current_num_user_procs > allowed_num_user_procs:
                    logger.warning(
                        'resources: number of processes by %s too high (%s > %s), cannot start new jobs'
                        % (current_user, current_num_user_procs,
                           allowed_num_user_procs))
                    return False
            except Exception as e:
                # process counting is best-effort; never block job starts on it
                logger.exception(e)
                pass

            # limit total number of parallel transferring jobs to self.max_nr_of_parallel_jobs
            with self.__lock:
                # jobs without a 'pipeline' entry are still starting up
                starting_threads = [
                    job_thread_dict['thread']
                    for job_thread_dict in list(self.__running_jobs.values())
                    if 'pipeline' not in job_thread_dict
                ]
                pipelines = [
                    job_thread_dict['pipeline']
                    for job_thread_dict in list(self.__running_jobs.values())
                    if 'pipeline' in job_thread_dict
                ]
                initializing_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_INITIALIZING
                ]
                transferring_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_TRANSFERRING
                ]
                finalizing_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_FINALIZING
                ]

                num_busy_transfers = len(starting_threads) + len(
                    initializing_pipelines) + len(transferring_pipelines)
                num_finalizing_transfers = len(finalizing_pipelines)

                logger.debug(
                    "resources: current num_busy_transfers = %s num_finalizing_transfers = %s max_nr_of_parallel_jobs = %s",
                    num_busy_transfers, num_finalizing_transfers,
                    self.max_nr_of_parallel_jobs)

                if num_busy_transfers >= self.max_nr_of_parallel_jobs:
                    logger.warning(
                        'resources: already running %d parallel jobs (#starting=%d, #transferring=%d) limiting the total number of transferring jobs to %d'
                        % (len(self.__running_jobs),
                           len(initializing_pipelines) + len(starting_threads),
                           len(transferring_pipelines),
                           self.max_nr_of_parallel_jobs))
                    return False

                if num_finalizing_transfers >= 2 * self.max_nr_of_parallel_jobs:
                    logger.warning(
                        'resources: already waiting for %d jobs to finish (updating status/SIP to MoM and LTA). not starting new jobs until some jobs finished...'
                        % (len(finalizing_pipelines), ))
                    return False

        except Exception as e:
            logger.exception(
                "error while checking for available resources: %s", e)

            # fallback heuristic when the checks themselves fail
            num_running_jobs = len(self.__running_jobs)
            if num_running_jobs <= 4:
                logger.info("running %d jobs, assuming we can run 1 more: ",
                            num_running_jobs)
                return True
            else:
                logger.warning(
                    "already running %d jobs, assuming for safety we cannot run more jobs...",
                    num_running_jobs)
                return False

        return True
# Example #10
    def transfer(self, force=False, dereference=False):
        """Transfer the source path(s) to the LTA destination surl.

        The data is streamed from the source host over netcat through the
        md5a32bc tool (which computes md5, adler32 and byte count on the fly)
        into a local fifo, from which globus-url-copy pushes it to the LTA.
        A second md5 checksum is computed on the source host from a remote
        fifo tee'd off the same stream. After the transfer, the local and
        remote md5's, the srm-reported adler32 and the srm-reported file size
        are all cross-checked.

        :param force: if True and the destination surl already exists, remove
                      it and retry the transfer once.
        :param dereference: if True, follow symlinks when determining the
                            input size and when creating the tar stream.
        :return: tuple of (md5_checksum, adler32_checksum, byte_count) with
                 byte_count as str.
        :raise LtacpException: on any setup, transfer or checksum error.
        :raise LtacpDestinationExistsException: when the destination exists
                 and force is False.
        """
        starttime = datetime.utcnow()

        # bookkeeping of started subprocesses and fifos, used by cleanup()
        self.started_procs = {}
        self.fifos = []

        try:
            if not self.source_exists():
                # distinguish the three failure modes for a clearer error
                if self.can_logon_to_source_host():
                    if self.source_mounted():
                        raise LtacpException(
                            "ltacp %s: source path %s:%s does not exist" %
                            (self.logId, self.src_host, self.src_path))
                    else:
                        raise LtacpException(
                            "ltacp %s: the disk of source path %s:%s does not seem to be mounted"
                            % (self.logId, self.src_host, self.src_path))
                else:
                    raise LtacpException(
                        "ltacp %s: cannot login to %s@%s" %
                        (self.logId, self.src_user, self.src_host))

            # determine if input is a single file (then no tar stream is needed)
            input_is_file = self.is_soure_single_file()

            if not input_is_file:
                # multiple files / a directory are sent as a tar(.gz) stream,
                # so make sure the destination surl has the matching extension
                missing_suffix = ""
                if self.gzip:
                    if not (self.dst_surl.endswith(".tar.gz")
                            or self.dst_surl.endswith(".tgz")):
                        if self.dst_surl.endswith(".tar"):
                            missing_suffix = ".gz"
                        else:
                            missing_suffix = ".tar.gz"
                else:
                    if not self.dst_surl.endswith(".tar"):
                        missing_suffix = ".tar"

                if missing_suffix:
                    self.dst_surl += missing_suffix
                    logger.info(
                        "ltacp %s: appending missing suffix %s to surl: %s",
                        self.logId, missing_suffix, self.dst_surl)

            dst_turl = convert_surl_to_turl(self.dst_surl)
            logger.info(
                'ltacp %s: initiating transfer of %s:%s to surl=%s turl=%s' %
                (self.logId, self.src_host, self.src_path, self.dst_surl,
                 dst_turl))

            # get input datasize via a remote 'du'
            du_items = self.src_path if input_is_file or isinstance(
                self.src_path, str) else ' '.join(self.src_path)
            cmd_remote_du = self.ssh_cmd + [
                'du -b %s --max-depth=0 %s' %
                ("--dereference" if dereference else "", du_items)
            ]
            logger.info('ltacp %s: remote getting datasize. executing: %s' %
                        (self.logId, ' '.join(cmd_remote_du)))
            p_remote_du = Popen(cmd_remote_du, stdout=PIPE, stderr=PIPE)
            self.started_procs[p_remote_du] = cmd_remote_du

            # block until du is finished
            output_remote_du = communicate_returning_strings(p_remote_du)
            del self.started_procs[p_remote_du]
            if p_remote_du.returncode != 0:
                raise LtacpException(
                    'ltacp %s: remote du failed: \nstdout: %s\nstderr: %s' %
                    (self.logId, output_remote_du[0], output_remote_du[1]))

            # compute various parameters for progress logging
            if input_is_file:
                input_datasize = int(output_remote_du[0].split()[0])
            else:
                # sum the per-item sizes (du prints one line per argument)
                input_datasize = sum([
                    int(line.strip().split()[0])
                    for line in output_remote_du[0].split('\n')
                    if line.strip()
                ])

            logger.info('ltacp %s: input datasize: %d bytes, %s' %
                        (self.logId, input_datasize,
                         humanreadablesize(input_datasize)))
            # 512 byte header, 2*512 byte ending, 512 byte modulo data
            estimated_tar_size = 512 * (input_datasize // 512) + 3 * 512
            logger.info('ltacp %s: estimated_tar_size: %d bytes, %s' %
                        (self.logId, estimated_tar_size,
                         humanreadablesize(estimated_tar_size)))

            #---
            # Server part
            #---

            # we'll randomize ports
            # to minimize initial collision, randomize based on path and time
            random.seed(hash(self.logId) ^ hash(time.time()))

            p_data_in, port_data = self._ncListen('data')

            self.local_data_fifo = '/tmp/ltacp_datapipe_%s_%s' % (
                self.src_host, self.logId)

            logger.info('ltacp %s: creating data fifo: %s' %
                        (self.logId, self.local_data_fifo))
            if os.path.exists(self.local_data_fifo):
                os.remove(self.local_data_fifo)
            os.mkfifo(self.local_data_fifo)
            if not os.path.exists(self.local_data_fifo):
                raise LtacpException("ltacp %s: Could not create fifo: %s" %
                                     (self.logId, self.local_data_fifo))

            # transfer incomming data stream via md5a32bc to compute md5, adler32 and byte_count
            # data is written to fifo, which is then later fed into globus-url-copy
            # on stdout we can monitor progress
            # set progress message step 0f 0.5% of estimated_tar_size
            cmd_md5a32bc = [
                'md5a32bc', '-p',
                str(min(1000000, estimated_tar_size // 200)),
                self.local_data_fifo
            ]
            logger.info(
                'ltacp %s: processing data stream for md5, adler32 and byte_count. executing: %s'
                % (
                    self.logId,
                    ' '.join(cmd_md5a32bc),
                ))
            p_md5a32bc = Popen(cmd_md5a32bc,
                               stdin=p_data_in.stdout,
                               stdout=PIPE,
                               stderr=PIPE,
                               universal_newlines=True)
            self.started_procs[p_md5a32bc] = cmd_md5a32bc

            # start copy fifo stream to globus-url-copy
            guc_options = [
                '-cd',  #create remote directories if missing
                '-p 4',  #number of parallel ftp connections
                '-bs 131072',  #buffer size
                '-b',  # binary
                '-nodcau',  # turn off data channel authentication for ftp transfers
            ]
            cmd_data_out = [
                '/bin/bash', '-c',
                'globus-url-copy %s file://%s %s' %
                (' '.join(guc_options), self.local_data_fifo, dst_turl)
            ]
            logger.info(
                'ltacp %s: copying data stream into globus-url-copy. executing: %s'
                % (self.logId, ' '.join(cmd_data_out)))
            p_data_out = Popen(cmd_data_out, stdout=PIPE, stderr=PIPE)
            self.started_procs[p_data_out] = cmd_data_out

            # Check if receiver side is set up correctly
            # and all processes are still waiting for input from client
            finished_procs = dict(
                (p, cl) for (p, cl) in list(self.started_procs.items())
                if p.poll() is not None)

            if len(finished_procs):
                msg = ''
                for p, cl in list(finished_procs.items()):
                    o, e = communicate_returning_strings(p)
                    msg += "  process pid:%d exited prematurely with exit code %d. cmdline: %s\nstdout: %s\nstderr: %s\n" % (
                        p.pid, p.returncode, cl, o, e)
                raise LtacpException(
                    "ltacp %s: %d local process(es) exited prematurely\n%s" %
                    (self.logId, len(finished_procs), msg))

            #---
            # Client part
            #---

            # start remote copy on src host:
            # 1) create fifo
            # 2) send tar stream of data/dir + tee to fifo for 3)
            # 3) simultaneously to 2), calculate checksum of fifo stream
            # 4) break fifo

            self.remote_data_fifo = '/tmp/ltacp_md5_pipe_%s' % (self.logId, )
            #make sure there is no old remote fifo
            self._removeRemoteFifo()
            cmd_remote_mkfifo = self.ssh_cmd + [
                'mkfifo %s' % (self.remote_data_fifo, )
            ]
            logger.info('ltacp %s: remote creating fifo. executing: %s' %
                        (self.logId, ' '.join(cmd_remote_mkfifo)))
            p_remote_mkfifo = Popen(cmd_remote_mkfifo,
                                    stdout=PIPE,
                                    stderr=PIPE)
            self.started_procs[p_remote_mkfifo] = cmd_remote_mkfifo

            # block until fifo is created
            output_remote_mkfifo = communicate_returning_strings(
                p_remote_mkfifo)
            del self.started_procs[p_remote_mkfifo]
            if p_remote_mkfifo.returncode != 0:
                raise LtacpException(
                    'ltacp %s: remote fifo creation failed: \nstdout: %s\nstderr: %s'
                    % (self.logId, output_remote_mkfifo[0],
                       output_remote_mkfifo[1]))

            with open(os.devnull, 'r') as devnull:
                # start sending remote data, tee to fifo
                if input_is_file:
                    cmd_remote_data = self.ssh_cmd + [
                        'cat %s %s| tee %s | %s %s %s' %
                        (self.src_path, '| gzip --stdout --fast '
                         if self.gzip else '', self.remote_data_fifo,
                         self.remoteNetCatCmd, self.localIPAddress, port_data)
                    ]
                else:
                    if isinstance(self.src_path, str):
                        #src_path is dir
                        src_path_parent, src_path_child = os.path.split(
                            self.src_path)
                    else:
                        #src_path is list if paths
                        dirs = set([os.path.dirname(p) for p in self.src_path])

                        if len(dirs) > 1:
                            raise LtacpException(
                                'ltacp %s: cannot combine multiple files from different directories in one tarbal'
                                % self.logId)

                        files = set(
                            [os.path.basename(p) for p in self.src_path])
                        src_path_parent = list(dirs)[0]
                        src_path_child = ' '.join(files)

                    cmd_remote_data = self.ssh_cmd + [
                        'cd %s && tar c %s -O %s %s| tee %s | %s %s %s' %
                        (src_path_parent, src_path_child, '--dereference'
                         if dereference else '', '| gzip --stdout --fast '
                         if self.gzip else '', self.remote_data_fifo,
                         self.remoteNetCatCmd, self.localIPAddress, port_data)
                    ]
                logger.info(
                    'ltacp %s: remote starting transfer. executing: %s' %
                    (self.logId, ' '.join(cmd_remote_data)))
                p_remote_data = Popen(cmd_remote_data,
                                      stdin=devnull,
                                      stdout=PIPE,
                                      stderr=PIPE)
                self.started_procs[p_remote_data] = cmd_remote_data

                # start computation of checksum on remote fifo stream
                cmd_remote_checksum = self.ssh_cmd + [
                    'md5sum %s' % (self.remote_data_fifo, )
                ]
                logger.info(
                    'ltacp %s: remote starting computation of md5 checksum. executing: %s'
                    % (self.logId, ' '.join(cmd_remote_checksum)))
                p_remote_checksum = Popen(cmd_remote_checksum,
                                          stdin=devnull,
                                          stdout=PIPE,
                                          stderr=PIPE)
                self.started_procs[p_remote_checksum] = cmd_remote_checksum

                # waiting for output, comparing checksums, etc.
                logger.info(
                    'ltacp %s: transfering... waiting for progress...' %
                    self.logId)
                transfer_start_time = datetime.utcnow()
                prev_progress_time = datetime.utcnow()
                prev_bytes_transfered = 0

                with PipeReader(p_md5a32bc.stdout, self.logId) as pipe_reader:
                    # wait and poll for progress while all processes are runnning
                    while len([
                            p for p in list(self.started_procs.keys())
                            if p.poll() is not None
                    ]) == 0:
                        try:
                            current_progress_time = datetime.utcnow()
                            elapsed_secs_since_prev = totalSeconds(
                                current_progress_time - prev_progress_time)

                            if elapsed_secs_since_prev > 900:
                                raise LtacpException(
                                    'ltacp %s: transfer stalled for 15min.' %
                                    (self.logId))

                            # read and process md5a32bc stdout lines to create progress messages
                            lines = pipe_reader.readlines(1)
                            nextline = lines[-1].strip() if lines else ''

                            if len(nextline) > 0:
                                try:
                                    logger.debug('ltacp %s: transfering... %s',
                                                 self.logId, nextline)
                                    total_bytes_transfered = int(
                                        nextline.split()[0].strip())
                                    percentage_done = (
                                        100.0 * float(total_bytes_transfered)
                                    ) / float(estimated_tar_size)
                                    elapsed_secs_since_start = totalSeconds(
                                        current_progress_time -
                                        transfer_start_time)
                                    if percentage_done > 0 and elapsed_secs_since_start > 0 and elapsed_secs_since_prev > 0:
                                        avg_speed = total_bytes_transfered / elapsed_secs_since_start
                                        current_bytes_transfered = total_bytes_transfered - prev_bytes_transfered
                                        current_speed = current_bytes_transfered / elapsed_secs_since_prev
                                        # throttle progress reporting: only log/callback
                                        # every 2 minutes or after a big jump in bytes
                                        if elapsed_secs_since_prev > 120 or current_bytes_transfered > 0.333 * estimated_tar_size:
                                            prev_progress_time = current_progress_time
                                            prev_bytes_transfered = total_bytes_transfered
                                            percentage_to_go = 100.0 - percentage_done
                                            time_to_go = elapsed_secs_since_start * percentage_to_go / percentage_done

                                            try:
                                                if self.progress_callback:
                                                    self.progress_callback(
                                                        percentage_done=
                                                        percentage_done,
                                                        current_speed=
                                                        current_speed,
                                                        total_bytes_transfered=
                                                        total_bytes_transfered)
                                            except Exception as e:
                                                # a faulty callback must not abort the transfer
                                                logger.error(e)

                                            logger.info(
                                                'ltacp %s: transfered %s %.1f%% in %s at avgSpeed=%s (%s) curSpeed=%s (%s) to_go=%s to %s'
                                                %
                                                (self.logId,
                                                 humanreadablesize(
                                                     total_bytes_transfered),
                                                 percentage_done,
                                                 timedelta(seconds=int(
                                                     round(
                                                         elapsed_secs_since_start
                                                     ))),
                                                 humanreadablesize(
                                                     avg_speed, 'Bps'),
                                                 humanreadablesize(
                                                     avg_speed * 8, 'bps'),
                                                 humanreadablesize(
                                                     current_speed, 'Bps'),
                                                 humanreadablesize(
                                                     current_speed * 8, 'bps'),
                                                 timedelta(seconds=int(
                                                     round(time_to_go))),
                                                 dst_turl))
                                except Exception as e:
                                    msg = 'ltacp %s: error while parsing md5a32bc loglines: error=%s. line=%s' % (
                                        self.logId, e, nextline)
                                    logger.error(msg)
                                    self.cleanup()
                                    raise LtacpException(msg)
                            time.sleep(0.05)
                        except KeyboardInterrupt:
                            # cleanup kills the subprocesses, so the while
                            # condition ends the loop on the next iteration
                            self.cleanup()
                        except LtacpException as e:
                            logger.error('ltacp %s: %s' % (self.logId, str(e)))
                            self.cleanup()
                            raise
                        except Exception as e:
                            logger.error('ltacp %s: %s' % (self.logId, str(e)))

                def waitForSubprocess(proc,
                                      timeout=timedelta(seconds=60),
                                      proc_log_name='',
                                      loglevel=logging.DEBUG):
                    # poll the given subprocess until it finished, or raise
                    # an LtacpException when it exceeds the given timeout
                    logger.log(
                        loglevel,
                        'ltacp %s: waiting at most %s for %s to finish...',
                        self.logId, timeout, proc_log_name)
                    start_wait = datetime.now()
                    while datetime.now() - start_wait < timeout:
                        if proc.poll() is not None:
                            break
                        time.sleep(1)

                    if proc.poll() is None:
                        raise LtacpException(
                            'ltacp %s: %s did not finish within %s.' %
                            (self.logId, proc_log_name, timeout))

                waitForSubprocess(p_data_out,
                                  timedelta(seconds=self.globus_timeout),
                                  'globus-url-copy', logging.INFO)
                output_data_out = communicate_returning_strings(p_data_out)
                if p_data_out.returncode != 0:
                    if 'file exist' in output_data_out[1].lower():
                        raise LtacpDestinationExistsException(
                            'ltacp %s: data transfer via globus-url-copy to LTA failed, file already exists. turl=%s.'
                            % (self.logId, dst_turl))
                    raise LtacpException(
                        'ltacp %s: transfer via globus-url-copy to LTA failed. turl=%s error=%s'
                        % (self.logId, dst_turl, output_data_out[0].strip() +
                           output_data_out[1].strip()))
                logger.info(
                    'ltacp %s: data transfer via globus-url-copy to LTA complete.'
                    % self.logId)

                waitForSubprocess(p_remote_data, timedelta(seconds=60),
                                  'remote data transfer')
                output_remote_data = communicate_returning_strings(
                    p_remote_data)
                if p_remote_data.returncode != 0:
                    raise LtacpException(
                        'ltacp %s: Error in remote data transfer: %s' %
                        (self.logId, output_remote_data[1]))
                logger.debug('ltacp %s: remote data transfer finished...' %
                             self.logId)

                waitForSubprocess(p_remote_checksum, timedelta(seconds=60),
                                  'remote md5 checksum computation')
                output_remote_checksum = communicate_returning_strings(
                    p_remote_checksum)
                if p_remote_checksum.returncode != 0:
                    raise LtacpException(
                        'ltacp %s: Error in remote md5 checksum computation: %s'
                        % (self.logId, output_remote_checksum[1]))
                logger.debug(
                    'ltacp %s: remote md5 checksum computation finished.' %
                    self.logId)

                waitForSubprocess(
                    p_md5a32bc, timedelta(seconds=60),
                    'local computation of md5 adler32 and byte_count')
                output_md5a32bc_local = communicate_returning_strings(
                    p_md5a32bc)
                if p_md5a32bc.returncode != 0:
                    raise LtacpException(
                        'ltacp %s: Error while computing md5 adler32 and byte_count: %s'
                        % (self.logId, output_md5a32bc_local[1]))
                logger.debug(
                    'ltacp %s: computed local md5 adler32 and byte_count.' %
                    self.logId)

                # process remote md5 checksums ('md5sum' output: "<md5> <path>")
                try:
                    md5_checksum_remote = output_remote_checksum[0].split(
                        ' ')[0]
                except Exception as e:
                    logger.error(
                        'ltacp %s: error while parsing remote md5: %s\n%s' %
                        (self.logId, output_remote_checksum[0],
                         output_remote_checksum[1]))
                    raise

                # process local md5 adler32 and byte_count
                try:
                    items = output_md5a32bc_local[1].splitlines()[-1].split(
                        ' ')
                    md5_checksum_local = items[0].strip()
                    a32_checksum_local = items[1].strip().zfill(8)
                    byte_count = int(items[2].strip())
                except Exception as e:
                    logger.error(
                        'ltacp %s: error while parsing md5 adler32 and byte_count outputs: %s'
                        % (self.logId, output_md5a32bc_local[0]))
                    raise

                # check transfered number of bytes
                if (byte_count == 0 and input_datasize > 0):
                    raise LtacpException(
                        'ltacp %s: did not transfer any bytes of the expected %s. Something is wrong in the datastream setup.'
                        % (self.logId, humanreadablesize(estimated_tar_size)))

                logger.info(
                    'ltacp %s: byte count of datastream is %d %s' %
                    (self.logId, byte_count, humanreadablesize(byte_count)))

                # compare local and remote md5
                if (md5_checksum_remote != md5_checksum_local):
                    # bugfix: the format string needs a placeholder for each
                    # argument (logId included), otherwise raising here fails
                    # with a TypeError instead of the intended exception
                    raise LtacpException(
                        'ltacp %s: md5 checksum reported by client (%s) does not match local checksum of incoming data stream (%s)'
                        % (self.logId, md5_checksum_remote,
                           md5_checksum_local))
                logger.info(
                    'ltacp %s: remote and local md5 checksums are equal: %s' %
                    (
                        self.logId,
                        md5_checksum_local,
                    ))

            logger.info('ltacp %s: fetching adler32 checksum from LTA...' %
                        self.logId)
            srm_ok, srm_file_size, srm_a32_checksum = get_srm_size_and_a32_checksum(
                self.dst_surl, 'ltacp %s:' % self.logId)

            if not srm_ok:
                raise LtacpException(
                    'ltacp %s: Could not get srm adler32 checksum for: %s' %
                    (self.logId, self.dst_surl))

            if srm_a32_checksum != a32_checksum_local:
                raise LtacpException(
                    'ltacp %s: adler32 checksum reported by srm (%s) does not match original data checksum (%s)'
                    % (self.logId, srm_a32_checksum, a32_checksum_local))

            logger.info('ltacp %s: adler32 checksums are equal: %s' % (
                self.logId,
                a32_checksum_local,
            ))

            if int(srm_file_size) != int(byte_count):
                raise LtacpException(
                    'ltacp %s: file size reported by srm (%s) does not match datastream byte count (%s)'
                    % (self.logId, srm_file_size, byte_count))

            logger.info(
                'ltacp %s: srm file size and datastream byte count are equal: %s bytes (%s)'
                %
                (self.logId, srm_file_size, humanreadablesize(srm_file_size)))
            logger.info('ltacp %s: transfer to LTA completed successfully.' %
                        (self.logId))

        except LtacpDestinationExistsException as e:
            logger.log(logging.WARN if force else logging.ERROR, str(e))
            if force:
                self.cleanup()
                srmrm(self.dst_surl, 'ltacp %s ' % self.logId)
                # bugfix: propagate dereference so the retried transfer
                # handles symlinks exactly like the original attempt
                return self.transfer(force=False, dereference=dereference)
            else:
                # re-raise the exception to the caller
                raise
        except Exception as e:
            # Something went wrong
            logger.error('ltacp %s: Error in transfer: %s' %
                         (self.logId, str(e)))
            # re-raise the exception to the caller
            raise
        finally:
            # cleanup
            self.cleanup()

        total_time = round(
            10 * totalSeconds(datetime.utcnow() - starttime)) / 10
        # guard against ZeroDivisionError in the speed computation for
        # (near) instantaneous transfers where total_time rounds to 0.0
        elapsed = total_time if total_time > 0 else 0.1
        logger.info(
            'ltacp %s: successfully completed transfer of %s:%s to %s in %ssec for %s at avg speed of %s or %s',
            self.logId, self.src_host, self.src_path, self.dst_surl,
            total_time, humanreadablesize(int(srm_file_size)),
            humanreadablesize(int(srm_file_size) / elapsed, 'Bps'),
            humanreadablesize(8 * int(srm_file_size) / elapsed, 'bps'))

        return (md5_checksum_local, a32_checksum_local, str(byte_count))