def _mark_directory_for_a_visit(self, dir_id):
    """
    update the directory's last visit time to unix-epoch (which is the lowest possible visit timestamp),
    so that it appears in the visitStats which are used by the scraper to determine the next directory to be visited.
    :param int dir_id: the id of the directory
    :return: None
    """
    with store.LTAStorageDb(self._dbcreds) as db:
        return db.updateDirectoryLastVisitTime(dir_id, datetime.fromtimestamp(0))
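# Why the epoch works as a "visit me first" marker: a last-visit time of 1970-01-01 sorts
# before any real visit timestamp, so an epoch-marked directory is always the least recently
# visited one in the visitStats. A minimal standalone sketch of that ordering (illustrative
# only, not part of the module):
def _example_epoch_sorts_first():
    """show that the unix epoch precedes any realistic last-visit timestamp"""
    from datetime import datetime, timedelta
    epoch = datetime.fromtimestamp(0)
    recently_visited = datetime.utcnow() - timedelta(minutes=5)
    return epoch < recently_visited  # True: epoch-marked dirs come first in "not visited since" queries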
def _insert_missing_directory_tree_if_needed(self, srm_url):
    # example url: srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884/L652884_SAP000_B000_P001_bf_e619e5da.tar
    # or for a dir: srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884
    # site_url then becomes: srm://lofar-srm.fz-juelich.de:8443
    # dir_path then becomes: /pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884
    site = self._get_site_from_db(srm_url)
    dir_path = get_dir_path_in_site(srm_url)

    with store.LTAStorageDb(self._dbcreds) as db:
        return db.insert_missing_directory_tree_if_needed(dir_path, site['id'])
def _get_site_from_db(self, srm_url):
    """
    find the site entry in the database for the given srm_url. raises a LookupError if not found.
    :param string srm_url: a valid srm url
    :return: a site entry dict from the database
    """
    site_url = get_site_surl(srm_url)

    # find site in db
    with store.LTAStorageDb(self._dbcreds) as db:
        site = next((s for s in db.sites() if s['url'] == site_url), None)
        if site is None:
            raise LookupError('Could not find site %s in database %s' % (site_url, self._dbcreds.database))
        return site
def _schedule_srmurl_for_visit(self, srm_url):
    """
    process the given srm_url, insert it in the db if needed, and mark it as not visited,
    so that the scraper will visit it soon.
    :param srm_url: a valid srm url like: srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884/L652884_SAP000_B000_P001_bf_e619e5da.tar
    :return: None
    """
    if srm_url:
        with store.LTAStorageDb(self._dbcreds) as db:
            site = self._get_site_from_db(srm_url)
            dir_path = get_dir_path_in_site(srm_url)
            directory = db.directoryByName(dir_path, site['id'])

            if directory is None:
                dir_id = self._insert_missing_directory_tree_if_needed(srm_url).get(dir_path)
            else:
                dir_id = directory.get('dir_id')

            if dir_id is not None:
                self._mark_directory_for_a_visit(dir_id)
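# Hedged usage sketch (not part of the module): how the handler methods above could be driven
# when an ingest notification arrives. The method name onJobFinished and the 'srm_url' field
# of the job dict are assumptions for illustration only; a finished ingest job means new data
# appeared at an LTA site, so its directory is scheduled for a fresh scraper visit.
#
#     def onJobFinished(self, job_dict):
#         # schedule the directory of the newly ingested file for a prompt visit
#         self._schedule_srmurl_for_visit(job_dict.get('srm_url'))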
def main():
    from optparse import OptionParser
    from lofar.common import dbcredentials

    # Check the invocation arguments
    parser = OptionParser("%prog [options]",
                          description='runs the lta scraper and stores results in the specified database.')
    parser.add_option_group(dbcredentials.options_group(parser))
    parser.set_defaults(dbcredentials="LTASO")
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

    dbcreds = dbcredentials.parse_options(options)
    logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword())

    global db
    db = store.LTAStorageDb(dbcreds)

    app.run(debug=False, host='0.0.0.0', port=9632)
def main():
    '''the main function scanning all locations and gathering the results'''
    from optparse import OptionParser
    from lofar.common import dbcredentials
    from lofar.messaging import DEFAULT_BROKER, DEFAULT_BUSNAME
    from lofar.lta.ltastorageoverview.ingesteventhandler import LTASOIngestEventHandler, IngestEventMesssageBusListener

    # Check the invocation arguments
    parser = OptionParser("%prog [options]",
                          description='runs the lta scraper and stores results in the specified database.')
    parser.add_option('-j', '--parallel', dest='parallel', type='int', default=8,
                      help='number of parallel srmls jobs to run, default: %default')
    parser.add_option('-b', '--broker', dest='broker', type='string', default=DEFAULT_BROKER,
                      help='Address of the messaging broker, default: %default')
    parser.add_option('-e', '--exchange', dest='exchange', type='string', default=DEFAULT_BUSNAME,
                      help='Name of the bus exchange on the broker on which the ingest notifications are published, default: %default')
    parser.add_option('-V', '--verbose', dest='verbose', action='store_true', help='verbose logging')
    parser.add_option_group(dbcredentials.options_group(parser))
    parser.set_defaults(dbcredentials="LTASO")
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG if options.verbose else logging.INFO)

    options.parallel = max(1, min(8 * multiprocessing.cpu_count(), options.parallel))
    logger.info("Using maximum number of parallel srmls jobs: %d" % options.parallel)

    dbcreds = dbcredentials.parse_options(options)
    logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword())

    db = store.LTAStorageDb(dbcreds)
    populateDbWithLTASitesAndRootDirs(db)

    # for each site we want one or more ResultGetterThreads
    # so make a dict with a list per site based on the locations
    getters = dict([(site['name'], []) for site in db.sites()])

    # some helper functions
    def numLocationsInQueues():
        '''returns the total number of locations in the queues'''
        return db.numDirectoriesNotVisitedSince(datetime.datetime.utcnow() - VISIT_INTERVAL)

    def totalNumGetters():
        '''returns the total number of parallel running ResultGetterThreads'''
        return sum([len(v) for v in list(getters.values())])

    def cleanupFinishedGetters():
        # get rid of old finished ResultGetterThreads
        finishedGetters = dict([(site_name, [getter for getter in getterList if not getter.isAlive()])
                                for site_name, getterList in list(getters.items())])

        for site_name, finishedGetterList in list(finishedGetters.items()):
            for finishedGetter in finishedGetterList:
                getters[site_name].remove(finishedGetter)

    # the main loop
    # loop over the locations and spawn ResultGetterThreads to get the results in parallel
    # use load balancing over the different sites and with respect to queue lengths
    # do not overload this host system
    with IngestEventMesssageBusListener(handler_type=LTASOIngestEventHandler,
                                        handler_kwargs={'dbcreds': dbcreds},
                                        exchange=options.exchange,
                                        broker=options.broker):
        while True:
            cleanupFinishedGetters()

            # spawn new ResultGetterThreads
            # do not overload this host system
            num_waiting = numLocationsInQueues()

            while (num_waiting > 0 and
                   totalNumGetters() < options.parallel and
                   os.getloadavg()[0] < 4 * multiprocessing.cpu_count()):
                sitesStats = db.visitStats(datetime.datetime.utcnow() - VISIT_INTERVAL)

                for site_name, site_stats in list(sitesStats.items()):
                    numGetters = len(getters[site_name])
                    queue_length = site_stats['queue_length']
                    weight = float(queue_length) / float(20 * (numGetters + 1))
                    if numGetters == 0 and queue_length > 0:
                        weight = 1e6  # make getterless sites extra important, so each site keeps flowing

                    site_stats['# get'] = numGetters
                    site_stats['weight'] = weight

                totalWeight = max(1.0, sum([site_stats['weight'] for site_stats in list(sitesStats.values())]))

                logger.debug("siteStats:\n%s" % str('\n'.join([str((k, v)) for k, v in list(sitesStats.items())])))

                # now pick a random site using the weights
                chosen_site_name = None
                cumul = 0.0
                r = random()
                for site_name, site_stats in list(sitesStats.items()):
                    ratio = site_stats['weight'] / totalWeight
                    cumul += ratio

                    if r <= cumul and site_stats['queue_length'] > 0:
                        chosen_site_name = site_name
                        break

                if not chosen_site_name:
                    break

                chosen_dir_id = sitesStats[chosen_site_name]['least_recent_visited_dir_id']
                db.updateDirectoryLastVisitTime(chosen_dir_id, datetime.datetime.utcnow())

                logger.debug("chosen_site_name: %s chosen_dir_id: %s", chosen_site_name, chosen_dir_id)

                # make and start a new ResultGetterThread for the location deque of the chosen site
                newGetter = ResultGetterThread(dbcreds, chosen_dir_id)
                newGetter.start()
                getters[chosen_site_name].append(newGetter)

                cleanupFinishedGetters()

                # refresh num_waiting
                num_waiting = numLocationsInQueues()

            logger.info('numLocationsInQueues=%d totalNumGetters=%d siteQueueLengths: %s load_1min: %.1f' %
                        (num_waiting,
                         totalNumGetters(),
                         ' '.join(['%s:%d' % (name, stats['queue_length']) for name, stats in list(sitesStats.items())]),
                         os.getloadavg()[0]))

            # sleep before the next main loop iteration
            # to wait for some results
            # and for some getters to finish
            time.sleep(30 if num_waiting <= options.parallel else 0.25)
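# The site selection in the main loop above is a cumulative-weight draw: each site gets a
# probability proportional to its weight, with getterless sites boosted so every site keeps
# flowing. A minimal standalone sketch of just that draw (illustrative names, not part of
# the scraper itself):
def pick_weighted(weights_by_name):
    '''pick a key from a {name: weight} dict with probability proportional to its weight'''
    from random import random
    total = max(1.0, sum(weights_by_name.values()))
    r = random()
    cumul = 0.0
    for name, weight in weights_by_name.items():
        cumul += weight / total
        if r <= cumul:
            return name
    return None

# e.g. pick_weighted({'sara': 5.0, 'juelich': 1.0}) returns 'sara' roughly 5x more often than 'juelich'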
def run(self):
    '''A single location is pop'ed from the locations deque and the results are queried.
    Resulting subdirectories are appended to the locations deque'''
    try:
        with store.LTAStorageDb(self.dbcreds) as db:
            dir = db.directory(self.dir_id)

            if not dir:
                return

            dir_id = dir['dir_id']
            dir_name = dir['dir_name']

            site_id = dir['site_id']
            site = db.site(site_id)
            srm_url = site['url']

        location = Location(srm_url, dir_name)

        try:
            def rescheduleVisit():
                for i in range(5):
                    try:
                        with store.LTAStorageDb(self.dbcreds) as db:
                            logger.info('Rescheduling %s for new visit.' % (location.path(),))
                            db.updateDirectoryLastVisitTime(self.dir_id,
                                                            datetime.datetime.utcnow() - VISIT_INTERVAL + datetime.timedelta(minutes=1))
                            break
                    except Exception:
                        time.sleep(1)

            # get results... long blocking
            result = location.getResult()
            logger.info(result)

            with store.LTAStorageDb(self.dbcreds) as db:
                # convert the result.files list into a dict
                # with (filename, dir_id) as key and a tuple with all file info as value
                result_file_tuple_dict = {}
                for file in result.files:
                    filename = file.filename.split('/')[-1]
                    key = (filename, dir_id)
                    file_tuple = (filename, int(file.size), file.created_at, dir_id)
                    result_file_tuple_dict[key] = file_tuple

                # create a dict of all already known files from the db
                known_file_dict = {}
                for file in db.filesInDirectory(dir_id):
                    key = (str(file['name']), dir_id)
                    known_file_dict[key] = file

                # now compare the result and known (filename, dir_id) sets
                # and find out which are new, and which are already known.
                # compare only by (filename, dir_id) because for a given file the size and/or date might have changed,
                # but that does not make it a new/unique file.
                result_file_key_set = set(result_file_tuple_dict.keys())
                known_file_key_set = set(known_file_dict.keys())
                new_file_key_set = result_file_key_set - known_file_key_set
                removed_file_key_set = known_file_key_set - result_file_key_set

                logger.info("%s %s: %d out of %d files are new, and %d are already known",
                            site['name'], dir_name,
                            len(new_file_key_set), len(result_file_key_set), len(known_file_key_set))

                if new_file_key_set:
                    new_file_tuple_set = [result_file_tuple_dict[key] for key in new_file_key_set]
                    file_ids = db.insertFileInfos(new_file_tuple_set)

                    if len(file_ids) != len(new_file_tuple_set):
                        rescheduleVisit()

                if known_file_key_set:
                    for key, known_file in list(known_file_dict.items()):
                        if key in result_file_tuple_dict:
                            result_file_tuple = result_file_tuple_dict[key]

                            known_size = int(known_file['size'])
                            result_size = result_file_tuple[1]

                            if known_size != result_size:
                                logger.info("%s %s: updating %s (id=%d) size from %d to %d",
                                            site['name'], dir_name, known_file['name'], known_file['id'],
                                            known_size, result_size)
                                db.updateFileInfoSize(known_file['id'], result_size)

                if removed_file_key_set:
                    for removed_file_key in removed_file_key_set:
                        db.deleteFileInfoFromDirectory(removed_file_key[0], removed_file_key[1])

                # skip empty nikhef dirs
                filteredSubDirectories = [loc for loc in result.subDirectories
                                          if not ('nikhef' in loc.srmurl and 'generated' in loc.directory)]

                # skip sksp spectroscopy project
                filteredSubDirectories = [loc for loc in filteredSubDirectories
                                          if not ('sara' in loc.srmurl and 'sksp' in loc.directory and 'spectro' in loc.directory)]

                subDirectoryNames = [loc.directory for loc in filteredSubDirectories]

                if subDirectoryNames:
                    # check for already known subdirectories in the db
                    known_subDirectoryNames_set = set(subdir['name'] for subdir in db.subDirectories(dir_id))

                    new_subdir_name_set = set(subDirectoryNames) - known_subDirectoryNames_set
                    logger.info("%s %s: %d out of %d subdirs are new, and %d are already known",
                                site['name'], dir_name,
                                len(new_subdir_name_set), len(subDirectoryNames), len(known_subDirectoryNames_set))

                    if new_subdir_name_set:
                        subdir_ids = db.insertSubDirectories(new_subdir_name_set, dir_id)

                        if len(subdir_ids) != len(new_subdir_name_set):
                            rescheduleVisit()

        except (SrmlsException, ParseException) as e:
            logger.error('Error while scanning %s\n%s' % (location.path(), str(e)))

            if 'does not exist' in str(e):
                with store.LTAStorageDb(self.dbcreds) as db:
                    db.deleteDirectory(self.dir_id)
            else:
                rescheduleVisit()

    except Exception as e:
        logger.exception(str(e))

        with store.LTAStorageDb(self.dbcreds) as db:
            logger.info('Rescheduling dir_id %d for new visit.' % (self.dir_id,))
            db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - VISIT_INTERVAL)
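# The file and subdirectory comparison in run() above boils down to a plain set difference
# on the listing keys: names present only in the fresh srmls result are new, names present
# only in the db have been removed. A minimal standalone sketch of that diff step
# (illustrative names, not part of the scraper itself):
def diff_listings(scraped_names, known_names):
    '''return (new, removed) name sets by comparing a fresh listing with the db contents'''
    scraped = set(scraped_names)
    known = set(known_names)
    return scraped - known, known - scraped

# e.g. diff_listings(['a.tar', 'b.tar'], ['b.tar', 'c.tar']) -> ({'a.tar'}, {'c.tar'})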
def create_database_connection(self) -> store.LTAStorageDb:
    return store.LTAStorageDb(self.dbcreds)
def main():
    from optparse import OptionParser
    from lofar.common import dbcredentials

    # Check the invocation arguments
    parser = OptionParser("%prog [options]",
                          description='execute a performance test by inserting many files on an empty test database.')
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

    with LTAStorageDbTestInstance() as test_db:
        base_date = datetime.utcnow()

        db = store.LTAStorageDb(test_db.dbcreds)

        db.insertSiteIfNotExists('sara', 'srm://srm.siteA.nl:8444')
        rootdir_id = db.insertRootDirectory('sara', '/pnfs/grid.siteA.nl/data/lofar/ops')
        projects_dir_id = db.insertSubDirectory('/pnfs/grid.siteA.nl/data/lofar/ops/projects', rootdir_id)

        total_num_files_inserted = 0

        with open('db_perf.csv', 'w') as file:
            for cycle_nr in range(1, 10):
                for project_nr in range(1, 10):
                    # project_name = 'lc%d_%03d/%d' % (cycle_nr, project_nr, os.getpid())
                    project_name = 'lc%d_%03d' % (cycle_nr, project_nr)

                    projectdir_id = db.insertSubDirectory('/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s' % (project_name,),
                                                          projects_dir_id)

                    obs_base_id = cycle_nr * 100000 + project_nr * 1000
                    for obs_nr, obsId in enumerate(range(obs_base_id, obs_base_id + 20)):
                        obsName = 'L%s' % obsId

                        obsdir_id = db.insertSubDirectory('/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s/%s' % (project_name, obsName),
                                                          projectdir_id)

                        fileinfos = [('%s_SB%3d' % (obsName, sbNr),
                                      1000 + sbNr + project_nr * cycle_nr,
                                      base_date + timedelta(days=10 * cycle_nr + project_nr, minutes=obs_nr, seconds=sbNr),
                                      obsdir_id)
                                     for sbNr in range(0, 2)]

                        now = datetime.utcnow()
                        file_ids = db.insertFileInfos(fileinfos)
                        total_num_files_inserted += len(file_ids)
                        elapsed = totalSeconds(datetime.utcnow() - now)

                        line = '%s,%s' % (total_num_files_inserted, elapsed)
                        print(line)
                        file.write(line + '\n')