def db_import_filter_params_cmd(cruise, filter_file, db_file):
    """
    Imports filter parameters to database.

    File paths must be new-style datestamped paths. Any part of the file path
    except for the filename will be ignored. The filename may include a '.gz'
    extension. If an entry can't be found for the specified cruise this
    command will exit with a non-zero exit status.
    """
    # Cruise resolution: the command-line option wins; otherwise fall back to
    # the cruise recorded in the database metadata table.
    if cruise is None:
        try:
            cruise = db.get_cruise(db_file)
        except SeaFlowpyError:
            pass
    if cruise is None:
        raise click.ClickException(
            'cruise must be specified either as command-line option or in database metadata table.'
        )

    # Read the CSV of filter parameters and normalize column names so that
    # dotted R-style headers become valid identifiers (e.g. "a.b" -> "a_b").
    csv_options = {"sep": ",", "na_filter": True, "encoding": "utf-8"}
    table = pd.read_csv(filter_file, **csv_options)
    table.columns = [name.replace('.', '_') for name in table.columns]

    # Keep only the rows belonging to the requested cruise.
    selected = table[table.cruise == cruise]
    if len(selected.index) == 0:
        raise click.ClickException('no filter parameters found for cruise %s' % cruise)
    db.save_filter_params(db_file, selected.to_dict('index').values())
def db_import_filter_params_cmd(cruise, filter_file, db_file):
    """
    Imports filter parameters to database.

    A new database will be created if it doesn't exist.
    """
    # Determine the cruise: prefer the explicit option, then try the db's
    # metadata table, and bail out with a CLI error if neither is available.
    if cruise is None:
        try:
            cruise = db.get_cruise(db_file)
        except SeaFlowpyError:
            pass
    if cruise is None:
        raise click.ClickException(
            'cruise must be specified either as command-line option or in database metadata table.'
        )

    # Load the parameter CSV; rewrite dotted column headers to underscored
    # ones so they can be used as attribute-style identifiers.
    frame = pd.read_csv(filter_file, sep=',', na_filter=True, encoding='utf-8')
    renamed = [col.replace('.', '_') for col in frame.columns]
    frame.columns = renamed

    matching = frame[frame.cruise == cruise]
    if len(matching.index) == 0:
        raise click.ClickException('no filter parameters found for cruise %s' % cruise)
    db.save_filter_params(db_file, matching.to_dict('index').values())
def db_import_sfl_cmd(cruise, force, json, serial, verbose, sfl_file, db_file):
    """
    Imports SFL metadata to database.

    Writes processed SFL-FILE data to SQLite3 database file. Data will be
    checked before inserting. If any errors are found the first of each type
    will be reported and no data will be written. To read from STDIN use '-'
    for SFL-FILE. SFL-FILE should have the <cruise name> and
    <instrument serial> embedded in the filename as
    '<cruise name>_<instrument serial>.sfl'. If not, specify as options.
    If a database file does not exist a new one will be created.
    Errors or warnings are output to STDOUT.
    """
    # NOTE: the 'json' parameter is a CLI flag (output format selector) and
    # shadows the stdlib 'json' module name; it can't be renamed without
    # changing the command's public interface.
    if sfl_file is not sys.stdin:
        # Try to read cruise and serial from filename
        results = sfl.parse_sfl_filename(sfl_file.name)
        if results:
            if cruise is None:
                cruise = results[0]
            if serial is None:
                serial = results[1]

    # Try to read cruise and serial from database if not already defined.
    # Lookup failures are non-fatal here; the combined check below decides.
    if cruise is None:
        try:
            cruise = db.get_cruise(db_file)
        except SeaFlowpyError:
            pass
    if serial is None:
        try:
            serial = db.get_serial(db_file)
        except SeaFlowpyError:
            pass

    # Make sure cruise and serial are defined somewhere
    if cruise is None or serial is None:
        raise click.ClickException(
            'instrument serial and cruise must both be specified either in filename as <cruise>_<instrument-serial>.sfl, as command-line options, or in database metadata table.'
        )

    # Read, normalize, and validate the SFL data before touching the db.
    df = sfl.read_file(sfl_file)
    df = sfl.fix(df)
    errors = sfl.check(df)
    if len(errors) > 0:
        if json:
            sfl.print_json_errors(errors, sys.stdout, print_all=verbose)
        else:
            sfl.print_tsv_errors(errors, sys.stdout, print_all=verbose)
        # Warnings alone don't block the import; true errors do unless --force.
        if not force and len([e for e in errors if e["level"] == "error"]) > 0:
            sys.exit(1)
    sfl.save_to_db(df, db_file, cruise, serial)
def local_filter_evt_cmd(evt_dir, s3_flag, dbpath, limit, opp_dir, process_count, resolution):
    """Filter EVT data locally.

    Reads the cruise name, filter parameters, and SFL file list from the
    SQLite3 db at dbpath, locates EVT files either on the local filesystem
    (evt_dir) or in S3 (s3_flag), and runs the filtering pipeline, writing
    OPP output to opp_dir.
    """
    # Validate args
    if not evt_dir and not s3_flag:
        raise click.UsageError('One of --evt_dir or --s3 must be provided')
    # Find cruise in db
    try:
        cruise = db.get_cruise(dbpath)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e))
    # Find filter parameters in db. Won't use them yet but better to check
    # upfront
    try:
        _filter_params = db.get_latest_filter(dbpath)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e))
    # Capture run parameters and information
    v = {
        'evt_dir': evt_dir,
        's3': s3_flag,
        'limit': limit,
        'db': dbpath,
        'opp_dir': opp_dir,
        'process_count': process_count,
        'resolution': resolution,
        'version': pkg_resources.get_distribution("seaflowpy").version,
        'cruise': cruise
    }
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    # Print run parameters
    print('Run parameters and information:')
    print(json.dumps(v, indent=2))
    print('')
    # Get list of files in sfl table.
    try:
        sfl_df = db.get_sfl_table(dbpath)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e))
    sfl_files = sfl_df["file"].tolist()
    # Find EVT files
    print('Getting lists of files to filter')
    if evt_dir:
        evt_files = seaflowfile.sorted_files(
            seaflowfile.find_evt_files(evt_dir))
    elif s3_flag:
        # Make sure configuration for s3 is ready to go
        config = conf.get_aws_config(s3_only=True)
        cloud = clouds.AWS(config.items("aws"))
        # Make sure try to access S3 up front to setup AWS credentials before
        # launching child processes.
        try:
            evt_files = cloud.get_files(cruise)
            evt_files = seaflowfile.keep_evt_files(
                evt_files)  # Only keep EVT files
        except botocore.exceptions.NoCredentialsError:
            # FIX: the AWS CLI is distributed as 'awscli' on PyPI; the old
            # message suggested installing a package named 'aws', which is a
            # different project. This now matches the remote filter command.
            print('Please configure aws first:', file=sys.stderr)
            print(' $ pip install awscli', file=sys.stderr)
            print(' then', file=sys.stderr)
            print(' $ aws configure', file=sys.stderr)
            raise click.Abort()
    # Check for duplicates, exit with message if any exist
    uniques = {seaflowfile.SeaFlowFile(f).file_id for f in evt_files}
    if len(uniques) < len(evt_files):
        raise click.ClickException('Duplicate EVT file(s) detected')
    # Find intersection of SFL files and EVT files
    files = seaflowfile.filtered_file_list(evt_files, sfl_files)
    print('sfl={} evt={} intersection={}'.format(len(sfl_files),
                                                 len(evt_files), len(files)))
    # Restrict length of file list with --limit
    if (limit is not None) and (limit > 0):
        files = files[:limit]
    # Filter
    try:
        filterevt.filter_evt_files(files, dbpath, opp_dir, s3=s3_flag,
                                   worker_count=process_count,
                                   every=resolution)
    except errors.SeaFlowpyError as e:
        raise click.ClickException(str(e))
def remote_filter_evt_cmd(dryrun, executable, instance_count, no_cleanup, output_dir, process_count, ramdisk_size, instance_type, dbs):
    """Filter EVT data on remote servers.

    SQLite3 db files must contain filter parameters and cruise name
    """
    # High-level flow: validate each db, build per-cruise EVT file lists from
    # S3, start (or fake, in --dryrun) EC2 instances, provision them over SSH
    # with fabric, then run the filter on each host. Cleanup happens in the
    # finally block regardless of success.
    print("Started at {}{}".format(datetime.datetime.utcnow().isoformat(), os.linesep))

    # Print defined parameters and information
    v = {
        'dbs': dbs,
        'executable': executable,
        'output_dir': output_dir,
        'dryrun': dryrun,
        'instance_count': instance_count,
        'no_cleanup': no_cleanup,
        'process_count': process_count,
        'instance_type': instance_type,
        'ramdisk_size': ramdisk_size,
        'version': pkg_resources.get_distribution("seaflowpy").version
    }
    to_delete = [k for k in v if v[k] is None]
    for k in to_delete:
        v.pop(k, None)  # Remove undefined parameters
    print('Run parameters and information:')
    print(json.dumps(v, indent=2))
    print('')

    # Make sure configuration for aws and ssh is ready to go
    config = conf.get_aws_config()
    conf.get_ssh_config(config)
    cloud = clouds.AWS(config.items('aws'))

    # If local executable is not given download latest from github
    remove_executable = False
    if not executable:
        remove_executable = True  # mark this file for deletion at exit
        executable = download_latest_linux()

    # Configure fabric
    env.connection_attempts = 6
    # Tell fabric the SSH user name and key file location
    env.user = config.get('ssh', 'ssh-user')
    env.key_filename = os.path.expanduser(
        config.get('ssh', 'ssh-private-key-file'))

    # NOTE(review): click.ClickException raised inside this try is caught by
    # the broad `except Exception` below and only printed, after which the
    # function still returns 0 — confirm whether a non-zero exit was intended
    # on these validation failures.
    try:
        if len(dbs) > 0:
            print('Getting lists of files for each cruise')
            cruise_files = {}
            for dbfile in dbs:
                # Make sure file exists
                if not os.path.exists(dbfile):
                    raise click.ClickException(
                        'DB file {} does not exist'.format(dbfile))
                # Make sure db has filter parameters filled in
                try:
                    filter_table = db.get_latest_filter(dbfile)
                except errors.SeaFlowpyError as e:
                    raise click.ClickException(
                        'No filter parameters found in database file {}'.
                        format(dbfile))
                # Exactly 3 rows expected per filter parameter set
                # (presumably one per quantile — TODO confirm).
                if len(filter_table) != 3:
                    raise click.ClickException(
                        'Unusual filter parameters found in database file {}'.
                        format(dbfile))
                # Get cruise name DB
                try:
                    c = db.get_cruise(dbfile)
                except errors.SeaFlowpyError as e:
                    raise click.ClickException(
                        'Could not retrieve cruise name from DB. {}'.format(e))
                # List this cruise's EVT files in S3; requires working AWS
                # credentials.
                try:
                    evt_files = seaflowfile.sorted_files(
                        seaflowfile.keep_evt_files(cloud.get_files(c)))
                except botocore.exceptions.NoCredentialsError as e:
                    print('Please configure aws first:', file=sys.stderr)
                    print(' $ pip install awscli', file=sys.stderr)
                    print(' then', file=sys.stderr)
                    print(' $ aws configure', file=sys.stderr)
                    raise click.Abort()
                # Check for duplicates, exit with message if any exist
                uniques = {
                    seaflowfile.SeaFlowFile(f).file_id
                    for f in evt_files
                }
                if len(uniques) < len(evt_files):
                    raise click.ClickException(
                        'Duplicate EVT file(s) detected')
                # Filter cruise files by SFL entries
                try:
                    sfl_df = db.get_sfl_table(dbfile)
                except errors.SeaFlowpyError as e:
                    print(
                        'Error retrieving SFL file list from DB: {}'.format(e))
                    return 1
                sfl_files = sfl_df["file"].tolist()
                # Find intersection of SFL files and EVT files
                cruise_files[c] = seaflowfile.filtered_file_list(
                    evt_files, sfl_files)
                print('{:<20} sfl={} evt={} intersection={}'.format(
                    c, len(sfl_files), len(evt_files), len(cruise_files[c])))
            print('')

        if dryrun:
            # Create dummy host list
            print('Creating {} dummy hosts'.format(instance_count))
            env.hosts = [
                'dummy{}'.format(i) for i in range(instance_count)
            ]
        else:
            print('Starting {} instances'.format(instance_count))
            result = cloud.start(count=instance_count,
                                 instance_type=instance_type)
            for iid, iip in zip(result['InstanceIds'], result['publicips']):
                print(' InstanceId = {}, IP = {}'.format(iid, iip))
            env.hosts.extend(result['publicips'])
        print('')

        # Fairly divide cruises into hosts based on number of files
        print('Assigning cruises to {} hosts'.format(len(env.hosts)))
        host_assignments = assign_keys_to_hosts(env.hosts, cruise_files)
        # Report per-host totals and per-cruise file counts.
        for h in host_assignments:
            htotal = sum([c[1] for c in host_assignments[h]])
            print('{:<20} {}'.format(h, htotal))
            for c in host_assignments[h]:
                print(' {:<18} {}'.format(c[0], c[1]))
        print('')

        if dryrun:
            print('Dry run complete')
            print('')
            return 0

        # Provision each host in sequence: wait for SSH, create ramdisk,
        # copy credentials/config/dbs, install deps, upload the executable.
        print('Waiting for hosts to come up with SSH')
        execute(wait_for_up)
        print('Creating initial ramdisk')
        with hide('output'):
            execute(create_ramdisk, ramdisk_size)
        print('Transfer AWS credentials')
        with hide('output'):
            execute(rsync_put, ['~/.aws/'], '.aws')
        print('Transfer seaflowpy configuration')
        with hide('output'):
            execute(rsync_put, ['~/.seaflowpy/'], '.seaflowpy')
        print('Transfer initial databases')
        execute(mkdir, REMOTE_DB_DIR)  # create db dir on each host
        with hide('output'):
            execute(rsync_put, dbs, REMOTE_DB_DIR)
        print('Install system dependencies')
        execute(install_system_dependencies)
        print('Upload seaflowpy executable')
        execute(upload_seaflowpy, executable)

        # Host list in env.hosts should be populated now and all machines up
        print('Filter data')
        execute(filter_cruise, host_assignments, output_dir, process_count)
    except Exception as e:
        # Top-level boundary: report and fall through to cleanup.
        print(f'Error: {e}')
    finally:
        disconnect_all()  # always disconnect SSH connections
        if not no_cleanup:
            cloud.cleanup()  # clean up in case of any unhandled exceptions
        # Clean up seaflowpy executable we downloaded
        if remove_executable:
            try:
                os.remove(executable)
            except OSError as e:
                print(
                    'Error: could not delete temporary seaflowpy executable: {} - {}'
                    .format(executable, e.strerror),
                    file=sys.stderr)

    print('Finished at {}'.format(datetime.datetime.utcnow().isoformat()))
    return 0