def test_get_waveform(buffer_mb):
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler = logging.FileHandler('%s/a_%d.txt' % (tempdir, int(buffer_mb)), mode='w')
    handler.setFormatter(formatter)

    logger = logging.getLogger('test')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)

    fds = FederatedASDFDataSet(asdf_file_list, logger=logger,
                               single_item_read_limit_in_mb=buffer_mb)
    rows = np.array(fds.get_stations('1900-01-01T00:00:00', '2100-01-01T00:00:00'))

    for n, s, l, c in rows[:, 0:4]:
        wc = fds.get_waveform_count(n, s, l, c,
                                    '1900-01-01T00:00:00', '2100-01-01T00:00:00')
        stream = fds.get_waveforms(n, s, l, c,
                                   '1900-01-01T00:00:00', '2100-01-01T00:00:00',
                                   trace_count_threshold=1e4)

        assert wc == len(stream)
        logger.info('%s.%s: %d traces fetched' % (n, s, len(stream)))
    # end for
def test_get_closest_stations(num_neighbours):
    fds = FederatedASDFDataSet(asdf_file_list)
    netsta, dist = fds.get_closest_stations(0, 0, num_neighbours)

    # There are a total of 8 stations in the data set.
    assert len(netsta) > 0 and len(netsta) <= 8 and len(netsta) <= num_neighbours
def test_get_stations():
    fds = FederatedASDFDataSet(asdf_file_list)
    rows = np.array(fds.get_stations('1900-01-01T00:00:00', '2100-01-01T00:00:00'))

    station_set = set()
    for n, s in rows[:, 0:2]:
        station_set.add((n, s))
    # end for

    # There are eight stations in the h5 file
    assert len(station_set) == 8
def test_get_coordinates():
    fds = FederatedASDFDataSet(asdf_file_list)
    rows = np.array(fds.get_stations('1900-01-01T00:00:00', '2100-01-01T00:00:00'))

    station_set = set()
    for n, s in rows[:, 0:2]:
        station_set.add((n, s))
    # end for

    # we should have coordinates for each station
    assert len(fds.unique_coordinates) == len(station_set)
def test_get_local_net_sta_list():
    fds = FederatedASDFDataSet(asdf_file_list)
    local_netsta_list = list(fds.local_net_sta_list())
    rows = np.array(fds.get_stations('1900-01-01T00:00:00', '2100-01-01T00:00:00'))

    # Get a list of unique stations
    stations = set()
    for n, s in rows[:, 0:2]:
        stations.add((n, s))
    # end for

    # On serial runs, all stations should be allocated to rank 0
    assert len(local_netsta_list) == len(stations)
def __init__(self, asdf_file_name, netsta_list='*'):
    self._data_path = asdf_file_name
    self._earth_radius = 6371  # km

    self.fds = FederatedASDFDataSet(asdf_file_name)

    # Gather station metadata
    netsta_list_subset = set(netsta_list.split(' ')) if netsta_list != '*' else netsta_list
    self.netsta_list = []
    self.metadata = defaultdict(list)

    rtps = []
    for netsta in list(self.fds.unique_coordinates.keys()):
        if (netsta_list_subset != '*'):
            if netsta not in netsta_list_subset:
                continue

        self.netsta_list.append(netsta)
        self.metadata[netsta] = self.fds.unique_coordinates[netsta]

        rtps.append([self._earth_radius,
                     np.radians(90 - self.metadata[netsta][1]),
                     np.radians(self.metadata[netsta][0])])
    # end for

    rtps = np.array(rtps)
    xyzs = rtp2xyz(rtps[:, 0], rtps[:, 1], rtps[:, 2])

    self._tree = cKDTree(xyzs)
    self._cart_location = defaultdict(list)
    for i, ns in enumerate(self.netsta_list):
        self._cart_location[ns] = xyzs[i, :]
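# NOTE: rtp2xyz is called by the constructor above but is not shown in this excerpt.
# A minimal sketch of such a spherical-to-Cartesian conversion, assuming vectorized
# numpy inputs of (radius, colatitude, longitude) with angles in radians:
import numpy as np

def rtp2xyz(r, theta, phi):
    """Convert spherical coordinates (radius r, colatitude theta, longitude phi,
    angles in radians) to Cartesian coordinates. Returns an (N, 3) array."""
    xout = np.zeros((r.shape[0], 3))
    rst = r * np.sin(theta)
    xout[:, 0] = rst * np.cos(phi)  # x
    xout[:, 1] = rst * np.sin(phi)  # y
    xout[:, 2] = r * np.cos(theta)  # z
    return xout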
def test_db_integrity():
    fds = FederatedASDFDataSet(asdf_file_list)

    # get number of waveforms from the db directly
    conn = sqlite3.connect(fds.fds.db_fn)
    query = 'select count(*) from wdb;'
    db_waveform_count = conn.execute(query).fetchall()[0][0]

    # fetch waveform counts for each unique combination of net, sta, loc, cha
    waveform_count = 0
    rows = fds.get_stations('1900-01-01T00:00:00', '2100-01-01T00:00:00')
    for row in rows:
        n, s, l, c, _, _ = row
        waveform_count += fds.get_waveform_count(n, s, l, c,
                                                 '1900-01-01T00:00:00', '2100-01-01T00:00:00')
    # end for

    assert waveform_count == db_waveform_count
def process(asdf_source, start_time, end_time, net, sta, cha, output_basename):
    """
    ASDF_SOURCE: Text file containing a list of paths to ASDF files\n
    START_TIME: Start time in UTCDateTime format\n
    END_TIME: End time in UTCDateTime format\n
    NET: Network name\n
    STA: Station name ('*' for all stations; note that * must be in quotation marks)\n
    CHA: Channel name ('*' for all channels; note that * must be in quotation marks)\n
    OUTPUT_BASENAME: Basename of output file

    Example usage:
    mpirun -np 112 python plot_data_quality.py asdf_files.txt 1980:01:01 2020:01:01 OA '*' '*' data_quality.oa
    """
    start_time = UTCDateTime(start_time)
    end_time = UTCDateTime(end_time)

    if (sta == '*'):
        sta = None
    if (cha == '*'):
        cha = None

    comm = MPI.COMM_WORLD
    nproc = comm.Get_size()
    rank = comm.Get_rank()

    l = setup_logger(name=output_basename, log_file='%s.log' % output_basename)
    fds = FederatedASDFDataSet(asdf_source, logger=l)

    stations = []
    if rank == 0:
        stations = fds.get_stations(start_time, end_time, network=net, station=sta, channel=cha)
        stations = split_list(sorted(stations), nproc)
    # end if

    stations = comm.bcast(stations, root=0)
    results = process_data(rank, fds, sorted(stations[rank]), start_time, end_time)
    results = comm.gather(results, root=0)

    if rank == 0:
        results = [item for sublist in results for item in sublist]    # flatten sublists for each proc
        stations = [item for sublist in stations for item in sublist]  # flatten sublists for each proc

        plot_results(stations, results, output_basename)
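# NOTE: split_list is used above to partition the station list across MPI ranks but is
# not defined in this excerpt. A minimal sketch, assuming it simply divides a list into
# npartitions roughly equal contiguous chunks (one chunk per rank):
def split_list(lst, npartitions):
    """Split lst into npartitions roughly equal contiguous chunks.
    Illustrative implementation; the project's actual helper may differ."""
    k, m = divmod(len(lst), npartitions)
    return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(npartitions)]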
def test_get_global_time_range():
    fds = FederatedASDFDataSet(asdf_file_list)

    rows = np.array(fds.get_stations('1900-01-01T00:00:00', '2100-01-01T00:00:00'))
    station_set = set()
    for n, s in rows[:, 0:2]:
        station_set.add((n, s))
    # end for

    minlist = []
    maxlist = []
    for (n, s) in station_set:
        min_time, max_time = fds.get_global_time_range(n, s)
        minlist.append(min_time)
        maxlist.append(max_time)
    # end for

    min_time = UTCDateTime(np.array(minlist).min())
    max_time = UTCDateTime(np.array(maxlist).max())

    # Ensure the aggregate min/max matches the corresponding values in the db
    assert min_time == UTCDateTime('2000-01-01T00:00:00.000000Z')
    assert max_time == UTCDateTime('2002-01-01T00:00:00.000000Z')
def aggregate(input_folder, output_file, folder_mask, station_database, max_depth_km, depth_levels):
    """
    Scrape together all the trans-D inversion solutions and collect into volumetric dataset.

    :param input_folder: Folder containing solutions to scrape together
    :type input_folder: str or Path
    :param output_file: Output file (must not exist already); saved in numpy .npy format
    :type output_file: str or Path
    :param folder_mask: Glob mask used to select solution folders within input_folder
    :param station_database: Index file used to instantiate FederatedASDFDataSet for station coordinates
    :param max_depth_km: Maximum depth (km) of the interpolated Vs profiles
    :param depth_levels: Number of depth levels to interpolate each profile onto
    """
    # Open station database from which to get station lat,lon coordinates
    station_location_db = FederatedASDFDataSet(station_database).unique_coordinates

    # Process folders in alphanumerical order
    folders = sorted(glob.glob(os.path.join(input_folder, folder_mask)))

    # regex pattern for matching case strings containing network, station and channel codes
    case_pattern = '^([a-zA-Z0-9]+)_([a-zA-Z0-9]+)_([a-zA-Z0-9]+)'
    matcher = re.compile(case_pattern)

    # Container for storing Vs as a function of depth for each station.
    station_profiles = []

    # Loop over folders one at a time
    for f in folders:
        # If it is a folder and has a solution file in it.
        if os.path.isdir(f) and os.path.isfile(os.path.join(f, SOLUTION_FILE)):
            _, case_folder = os.path.split(f)
            case_meta = matcher.match(case_folder)
            # Extract network, station and channel metadata from folder name
            net = case_meta.group(1)
            sta = case_meta.group(2)
            cha = case_meta.group(3)
            station_id = '.'.join([net, sta, cha])
            soln_file = os.path.join(f, SOLUTION_FILE)
            # station_coords are in lon,lat order
            station_coords = station_location_db['.'.join([net, sta])]
            print(station_id, station_coords)

            # Open solution file and collect relevant fields
            with open(soln_file, 'r') as posterior:
                post_dat = posterior.readlines()
            # end with

            _0, depth_discretization, depth_max = post_dat[0].strip('\n').split(None)
            depth_discretization = int(depth_discretization)
            depth_max = float(depth_max)
            z_range = depth_max * (np.arange(depth_discretization) + 0.5) / depth_discretization

            Vs_min, Vs_max, vel_discretization, _width = post_dat[1].strip('\n').split(None)
            vel_discretization = int(vel_discretization)
            Vs_min, Vs_max = float(Vs_min), float(Vs_max)
            vel_range = Vs_min + (Vs_max - Vs_min) * \
                (np.arange(vel_discretization) + 0.5) / vel_discretization

            # Each row of posterior_distribution corresponds to a discrete depth. At each depth,
            # we have a velocity probability distribution based on MCMC sampling.
            posterior_distribution = np.reshape(
                np.array([float(x.strip('\n')) for x in post_dat[2:]]),
                (depth_discretization, vel_discretization))
            # Normalize the distribution at each depth.
            post = posterior_distribution / np.expand_dims(np.sum(posterior_distribution, axis=-1), -1)
            assert np.allclose(np.sum(post, axis=-1), 1)

            # Compute mean at each depth, reducing the 2D posterior to 1D
            # velocity as a function of depth.
            vel_mean = np.dot(post, vel_range)

            # Create 4-column 2D matrix storing results for this station.
            xy_range = np.array([[station_coords[0], station_coords[1]]] * depth_levels)
            interpolator = interp1d(z_range, vel_mean, kind='cubic')
            z_interp = max_depth_km * (np.arange(depth_levels) + 0.5) / depth_levels
            vel_interp = interpolator(z_interp)
            data_all = np.column_stack([xy_range, z_interp, vel_interp])

            station_profiles.append(data_all)
        # end if
    # end for

    data_all = np.vstack(station_profiles)
    np.save(output_file, data_all, allow_pickle=False)
    print('Saved {} size array to {}'.format(data_all.shape, output_file))
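# NOTE: The reduction above from a 2D depth-velocity posterior to a mean-velocity profile
# is a single matrix-vector product. A small self-contained illustration with made-up
# numbers (not taken from any real solution file):
import numpy as np

# Toy posterior: 3 depth levels x 4 velocity bins; rows sum to 1 after normalization.
toy_posterior = np.array([[1.0, 3.0, 4.0, 2.0],
                          [0.5, 0.5, 2.0, 1.0],
                          [2.0, 1.0, 1.0, 0.0]])
toy_post = toy_posterior / toy_posterior.sum(axis=-1, keepdims=True)

# Bin-centre velocities (km/s) corresponding to the 4 bins.
toy_vel_range = np.array([3.0, 3.5, 4.0, 4.5])

# Mean velocity at each depth: weighted average of the bin centres.
toy_vel_mean = np.dot(toy_post, toy_vel_range)
print(toy_vel_mean)  # one mean Vs value per depth level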
def main(infile, fds_file, sheet_names, k):
    """
    Process Excel spreadsheet into point dataset based on station codes.

    Example usage:

        python hk_stations2point.py --fds-file /g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt \
            network_hk_data_sample.xlsx

    Output format is a csv file containing point data in the form of station code, lon/lat
    coordinates and depth measurement. For example:

        # Sta,Lon,Lat,Depth
        I8,133.035951,-19.473353,37.9
        H8,133.006100,-20.003900,45.9
        G8,132.997000,-20.486800,40.8
        F8,132.991205,-20.997177,47.3
        D8,132.989100,-21.506900,30.0
        ...

    Output file name is inferred from input Excel file name with extension changed to '.csv'

    :param infile: Input Excel file containing bespoke digitized Moho depths
    :param fds_file: Index file used to instantiate FederatedASDFDataSet
    :return: None
    """
    with xlrd.open_workbook(infile) as wb:
        if not sheet_names:
            sheet_names = wb.sheet_names()
            print('Processing all sheets:\n', sheet_names)
        else:
            _sheet_names = wb.sheet_names()
            for name in sheet_names:
                assert name in _sheet_names, 'Sheet {} not found in workbook!'.format(name)
        # end if
    # end with

    fds = FederatedASDFDataSet(fds_file)
    sta_coords = fds.unique_coordinates
    pts = []
    with xlrd.open_workbook(infile) as wb:
        for sheet_name in sheet_names:
            sheet = wb.sheet_by_name(sheet_name)
            print('Processing sheet {}'.format(sheet_name))
            try:
                network = sheet.cell_value(0, 0)
                network = network.split()[-1]
            except IndexError:
                print('Network code not found in string "{}", exiting'.format(network))
                exit(1)
            # end try
            network = NETWORK_CODE_MAPPINGS.get(network, network)

            for i, row in enumerate(sheet.get_rows()):
                if i == 0:
                    print('Skipping header row:', row)
                    continue
                # end if
                if not row or not row[0].value:
                    break
                # end if
                station = str(row[0].value)
                for sc in SPECIAL_CHARS:
                    station = station.split(sc)[0]
                station = '.'.join([network, station])
                if k:
                    val = float(row[2].value)
                else:
                    val = float(row[1].value)
                if np.isnan(val):
                    print(f"Invalid depth value for {station}, skipping")
                    continue
                coords = sta_coords[station]
                if not coords:
                    print(f"Couldn't find coordinates for {station}, skipping")
                    continue
                pt_data = [station] + coords + [val]
                pts.append(pt_data)
            # end for
        # end for
    # end with

    all_data = np.array(pts)
    print('Collected {} samples from {} sheets'.format(all_data.shape[0], len(sheet_names)))
    filebase = os.path.splitext(infile)[0]
    outfile = filebase + '.csv'
    print('Saving point data to file "{}"'.format(outfile))
    header = 'Sta,Lon,Lat,'
    header = header + 'K' if k else header + 'Depth'
    np.savetxt(outfile, all_data, fmt=['%s', '%s', '%s', '%s'], delimiter=',', header=header)
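# NOTE: NETWORK_CODE_MAPPINGS and SPECIAL_CHARS are module-level constants referenced here
# (and in the ccp_lines2point script below) but not shown in this excerpt. Hypothetical
# placeholder definitions illustrating only their expected shape; the actual codes and
# characters used by the project are assumptions:
NETWORK_CODE_MAPPINGS = {
    'OA2': 'OA',  # assumed example: alternate spreadsheet code -> canonical network code
}

# Characters after which a station code read from a spreadsheet cell is truncated
# (e.g. footnote markers appended to the station name). Assumed example values.
SPECIAL_CHARS = ['*', '#']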
        :return: tuples containing [net, sta, start_time, end_time]; start- and end-times
                 are instances of obspy.UTCDateTime
        """
        for item in self.fds.local_net_sta_list():
            yield item
        # end for
    # end func
# end class


if __name__ == "__main__":
    """
    How to Run Example::

        python ASDFdatabase/FederatedASDFDataSet.py /Datasets/asdf_file_index.txt

    Upon success, a db file will be created:

        /Datasets/f374ca9e7dd8abd2a1d58575e0d55520f30ffc23.db
    """
    import sys
    from seismic.ASDFdatabase.FederatedASDFDataSet import FederatedASDFDataSet

    if len(sys.argv) < 2:
        print("******** USAGE: python3 %s %s **********" % (sys.argv[0], "asdf_file_list_txt"))
        sys.exit(1)

    asdf_file_list = sys.argv[1]
    ds = FederatedASDFDataSet(asdf_file_list)
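    # NOTE: Illustrative follow-up queries once the index database has been built; not part
    # of the original script. Uses only methods shown elsewhere in this module; the time
    # window below is a placeholder.
    rows = ds.get_stations('2010-01-01T00:00:00', '2011-01-01T00:00:00')
    for net, sta, loc, cha, lon, lat in rows:
        # Count waveforms available for each unique net/sta/loc/cha combination
        count = ds.get_waveform_count(net, sta, loc, cha,
                                      '2010-01-01T00:00:00', '2011-01-01T00:00:00')
        print(net, sta, loc, cha, count)
    # end for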
def main(inventory_file, waveform_database, event_catalog_file, rf_trace_datafile,
         start_time, end_time, taup_model, distance_range, magnitude_range):

    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    waveform_db_is_web = is_url(waveform_database) \
        or waveform_database in obspy.clients.fdsn.header.URL_MAPPINGS
    if not waveform_db_is_web:
        assert os.path.exists(waveform_database), \
            "Cannot find waveform database file {}".format(waveform_database)
    log.info("Using waveform data source: {}".format(waveform_database))

    assert not os.path.exists(rf_trace_datafile), \
        "Won't delete existing file {}, remove manually.".format(rf_trace_datafile)

    min_dist_deg = distance_range[0]
    max_dist_deg = distance_range[1]
    min_mag = magnitude_range[0]
    max_mag = magnitude_range[1]

    inventory = read_inventory(inventory_file)
    log.info("Loaded inventory {}".format(inventory_file))

    # Compute reference lonlat from the inventory.
    channels = inventory.get_contents()['channels']
    lonlat_coords = []
    for ch in channels:
        coords = inventory.get_coordinates(ch)
        lonlat_coords.append((coords['longitude'], coords['latitude']))
    # end for
    lonlat_coords = np.array(lonlat_coords)
    lonlat = np.mean(lonlat_coords, axis=0)
    log.info("Inferred reference coordinates {}".format(lonlat))

    # If start and end time not provided, infer from date range of inventory.
    if not start_time:
        start_time = inventory[0].start_date
        for net in inventory:
            start_time = min(start_time, net.start_date)
        log.info("Inferred start time {}".format(start_time))
    # end if
    if not end_time:
        end_time = inventory[0].end_date
        if end_time is None:
            end_time = UTC.now()
        for net in inventory:
            end_time = max(end_time, net.end_date)
        log.info("Inferred end time {}".format(end_time))
    # end if

    start_time = UTC(start_time)
    end_time = UTC(end_time)
    event_catalog_file = timestamp_filename(event_catalog_file, start_time, end_time)
    rf_trace_datafile = timestamp_filename(rf_trace_datafile, start_time, end_time)
    log.info("Traces will be written to: {}".format(rf_trace_datafile))

    exit_after_catalog = False
    catalog = get_events(lonlat, start_time, end_time, event_catalog_file,
                         (min_dist_deg, max_dist_deg), (min_mag, max_mag), exit_after_catalog)

    if waveform_db_is_web:
        existing_index = None
        log.info("Use fresh query results from web")
        client = Client(waveform_database)
        waveform_getter = client.get_waveforms
    else:
        # Form closure to allow waveform source file to be derived from a setting (or command line input)
        asdf_dataset = FederatedASDFDataSet(waveform_database, logger=log)

        def closure_get_waveforms(network, station, location, channel, starttime, endtime):
            return custom_get_waveforms(asdf_dataset, network, station, location, channel,
                                        starttime, endtime)

        existing_index = _get_existing_index(rf_trace_datafile)
        if existing_index is not None:
            log.warning("Resuming extraction using existing index from file {}".format(rf_trace_datafile))
        waveform_getter = closure_get_waveforms
    # end if

    with tqdm(smoothing=0) as pbar:
        stream_count = 0
        for s in iter_event_data(catalog, inventory, waveform_getter, tt_model=taup_model, pbar=pbar):
            # Write traces to output file in append mode so that arbitrarily large file
            # can be processed. If the file already exists, then existing streams will
            # be overwritten rather than duplicated.
            # Check first if rotation for unaligned *H1, *H2 channels to *HN, *HE is required.
            if s.select(component='1') and s.select(component='2'):
                s.rotate('->ZNE', inventory=inventory)
            # end if
            # Order the traces in ZNE ordering. This is required so that normalization
            # can be specified in terms of an integer index, i.e. the default of 0 in rf
            # library will normalize against the Z component.
            s.traces = sorted(s.traces, key=zne_order)
            # Assert the ordering of traces in the stream is ZNE.
            assert s.traces[0].stats.channel[-1] == 'Z'
            assert s.traces[1].stats.channel[-1] == 'N'
            assert s.traces[2].stats.channel[-1] == 'E'
            # Loop over ZNE traces
            for tr in s:
                grp_id = '.'.join(tr.id.split('.')[0:3])
                event_time = str(tr.meta.event_time)[0:19]
                pbar.set_description("{} -- {}".format(grp_id, event_time))
                if existing_index is not None:
                    # Skip records that already exist in the file to speed up generation
                    if grp_id in existing_index and event_time in existing_index[grp_id]:
                        pbar.write("Skipping {} -- {} already exists in output file".format(grp_id, event_time))
                        continue
                    else:
                        # Use don't override mode just in case our hand-crafted index is faulty
                        stream_count += 1
                        tr.write(rf_trace_datafile, 'H5', mode='a', override='dont')
                else:
                    stream_count += 1
                    tr.write(rf_trace_datafile, 'H5', mode='a')
            # end for
        # end for

        if stream_count == 0:
            log.warning("No traces found!")
        else:
            log.info("Wrote {} new streams to output file".format(stream_count))
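# NOTE: zne_order is used above as a sort key but is not defined in this excerpt.
# A minimal sketch, assuming it simply maps the trailing channel-code letter to a
# Z, N, E ordering (any other component sorts last):
def zne_order(tr):
    """Sort key ordering traces Z, N, E by the last letter of the channel code.
    Illustrative implementation only."""
    trace_order = {'Z': 0, 'N': 1, 'E': 2}
    return trace_order.get(tr.stats.channel[-1], 3)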
# TODO: Fix resource management here so that asdf_files_dir gets deleted when tests are finished/finalized.
path = os.path.dirname(os.path.abspath(__file__))

# Initialize input data
asdf_files_dir = tempfile.mkdtemp(suffix='_test')
asdf_file_list1 = os.path.join(asdf_files_dir, 'asdf_file_list1.txt')
asdf_file_list2 = os.path.join(asdf_files_dir, 'asdf_file_list2.txt')

f1 = open(asdf_file_list1, 'w+')
f2 = open(asdf_file_list2, 'w+')
f1.write('%s/data/test_data_ARMA.h5\n' % (path))
f2.write('%s/data/test_data_QLP.h5\n' % (path))
f1.close()
f2.close()

fds1 = FederatedASDFDataSet(asdf_file_list1)
fds2 = FederatedASDFDataSet(asdf_file_list2)

# Initialize input inventory
inv = read_inventory('%s/data/response_inventory.fdsnxml' % (path))

# Unzip expected results
expected_folder = tempfile.mkdtemp()
cmd = 'tar -zxvf %s -C %s' % ('%s/data/expected/expected.tar.gz' % path, expected_folder)
os.system(cmd)


@pytest.fixture(params=['BHZ', '00T'])
def cha(request):
    return request.param
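# NOTE: The tests above (test_get_waveform, test_get_closest_stations) take buffer_mb and
# num_neighbours arguments, which implies parameterized fixtures analogous to 'cha'.
# A minimal sketch of what those fixtures might look like; the parameter values below
# are assumptions, not taken from the original test module:
@pytest.fixture(params=[1, 4, 16])  # assumed read-buffer sizes in MB
def buffer_mb(request):
    return request.param


@pytest.fixture(params=[1, 4, 8])  # assumed neighbour counts
def num_neighbours(request):
    return request.param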
def main(infile, fds_file):
    """
    Process Excel spreadsheet into point dataset based on line profiles.

    Example usage:

        python ccp_lines2point.py --fds-file /g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt \
            ccp_line_data_sample.xlsx

    Output format is csv file containing point data in the form of lon/lat coordinates
    and depth measurement. For example:

        # Lon,Lat,Depth
        134.909765,-17.572545,47.9
        135.017670,-17.570829,47.3
        135.134567,-17.568970,48.9
        135.395337,-17.564823,52.1
        135.494250,-17.563250,52.1
        ...

    Output file name is inferred from input Excel file name with extension changed to '.csv'

    :param infile: Input Excel file containing bespoke digitized Moho depths
    :param fds_file: Index file used to instantiate FederatedASDFDataSet
    :return: None
    """
    with xlrd.open_workbook(infile) as wb:
        sheet = wb.sheet_by_index(0)
        network = sheet.cell_value(0, 3)
        lines_row = sheet.row_values(3)
        lines = [line for line in lines_row if line]
    # end with

    df = pd.read_excel(infile, header=4)
    df.drop(df.columns[0], axis=1, inplace=True)

    fds = FederatedASDFDataSet(fds_file)
    sta_coords = fds.unique_coordinates

    vol_data_dict = {}
    for i, line in enumerate(lines):
        line = line.strip()
        sta_start, sta_end = line.split(',')
        sta_start = sta_start.strip()
        sta_end = sta_end.strip()
        start = '.'.join([network, sta_start])
        end = '.'.join([network, sta_end])
        start = np.array(sta_coords[start])
        end = np.array(sta_coords[end])
        assert np.any(end != start)
        dirn = (end - start)
        dirn = dirn / np.linalg.norm(dirn)

        dist_col = df.iloc[:, 3 * i + 1]
        dist_col = pd.to_numeric(dist_col, errors='coerce').astype(float)
        valid = dist_col.notna()
        if not np.any(valid):
            continue

        dist = dist_col[valid].values - LEAD_INOUT_DIST_KM
        depth = df.iloc[:, 3 * i + 2][valid].values
        lonlat = start + np.outer(dist, dirn) / KM_PER_DEG

        # Difficult to correct for differences in station elevation because
        # FDS does not include it in station coords. Ignore for now.
        vol_data = np.hstack((lonlat, depth[:, np.newaxis]))
        vol_data_dict[line] = vol_data
    # end for

    filebase = os.path.splitext(infile)[0]
    outfile = filebase + '.csv'
    all_data = np.vstack(tuple(v for v in vol_data_dict.values()))
    np.savetxt(outfile, all_data, fmt=['%.6f', '%.6f', '%.1f'], delimiter=',',
               header='Lon,Lat,Depth')
def main(infile, fds_file, raise_errors=False):
    """
    Process Excel spreadsheet into point dataset based on line profiles.

    Example usage:

        python ccp_lines2point.py --fds-file /g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt \
            ccp_line_data_sample.xlsx

    Output format is csv file containing point data in the form of sta, lon/lat coordinates
    and depth measurement. For example:

        # Sta,Lon,Lat,Depth
        I8,134.909765,-17.572545,47.9
        H8,135.017670,-17.570829,47.3
        G8,135.134567,-17.568970,48.9
        F8,135.395337,-17.564823,52.1
        D8,135.494250,-17.563250,52.1
        ...

    Output file name is inferred from input Excel file name with extension changed to '.csv'

    :param infile: Input Excel file containing bespoke digitized Moho depths
    :param fds_file: Index file used to instantiate FederatedASDFDataSet
    :param raise_errors: If True, raise on bad input data instead of printing a warning and skipping
    :return: None
    """
    fds = FederatedASDFDataSet(fds_file)
    sta_coords = fds.unique_coordinates
    all_network_codes = {s.split('.')[0] for s in sta_coords.keys()}
    vol_data_list = []
    with xlrd.open_workbook(infile) as wb:
        for sheet_idx, sheet in enumerate(wb.sheets()):
            network = sheet.cell_value(0, 3)
            network = NETWORK_CODE_MAPPINGS.get(network, network)
            lines_row = sheet.row_values(3)
            lines = [line for line in lines_row if line]

            df = pd.read_excel(infile, sheet_name=sheet_idx, header=4)
            df.drop(df.columns[0], axis=1, inplace=True)

            for i, line in enumerate(lines):
                line = line.strip()
                sta_start, sta_end = line.split(',')
                sta_start = sta_start.strip()
                sta_end = sta_end.strip()
                netsta_start = '.'.join([network, sta_start])
                netsta_end = '.'.join([network, sta_end])
                start = np.array(sta_coords[netsta_start])
                end = np.array(sta_coords[netsta_end])
                if start.size == 0 or end.size == 0:
                    msg = f"Can't get coordinates for {netsta_start} or {netsta_end}"
                    if raise_errors:
                        raise Exception(msg)
                    else:
                        print(msg)
                        continue
                # end if
                if not np.any(end != start):
                    msg = f"Invalid profile line {netsta_start} to {netsta_end}"
                    if raise_errors:
                        raise Exception(msg)
                    else:
                        print(msg)
                        continue
                # end if
                dirn = (end - start)
                dirn = dirn / np.linalg.norm(dirn)

                dist_col = df.iloc[:, 3 * i + 1]
                dist_col = pd.to_numeric(dist_col, errors='coerce').astype(float)
                valid = dist_col.notna()
                if not np.any(valid):
                    msg = f"No valid values for profile line {netsta_start} to {netsta_end}"
                    if raise_errors:
                        raise Exception(msg)
                    else:
                        print(msg)
                        continue
                # end if

                dist = dist_col[valid].values - LEAD_INOUT_DIST_KM
                depth = df.iloc[:, 3 * i + 2][valid].values
                stations = df.iloc[:, 3 * i][valid].values
                for sc in SPECIAL_CHARS:
                    stations = [s.split(sc)[0] for s in stations]
                stations = np.array(['.'.join([network, s]) for s in stations])
                lonlat = start + np.outer(dist, dirn) / KM_PER_DEG

                vol_data = np.hstack((stations[:, np.newaxis], lonlat, depth[:, np.newaxis]))
                vol_data_list.append(vol_data)
            # end for
        # end for
    # end with

    filebase = os.path.splitext(infile)[0]
    outfile = filebase + '.csv'
    all_data = np.vstack(tuple(v for v in vol_data_list))
    np.savetxt(outfile, all_data, fmt=['%s', '%s', '%s', '%s'], delimiter=',',
               header='Sta,Lon,Lat,Depth')
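# NOTE: The profile geometry above maps an along-line distance (km) to lon/lat by stepping
# from the start-station coordinates along the unit direction vector, converting km to
# degrees with KM_PER_DEG. A tiny self-contained illustration with made-up coordinates
# (the ~111.32 km/degree factor and latitude-independent scaling are simplifying assumptions,
# consistent with the approach above):
import numpy as np

km_per_deg = 111.32  # approximate km per degree of arc

# Made-up start/end station coordinates in (lon, lat) order.
toy_start = np.array([133.0, -19.5])
toy_end = np.array([134.0, -19.5])
toy_dirn = (toy_end - toy_start) / np.linalg.norm(toy_end - toy_start)

# Along-profile sample distances in km from the start station.
toy_dist = np.array([0.0, 25.0, 50.0])

# Approximate lon/lat of each sample point along the profile.
toy_lonlat = toy_start + np.outer(toy_dist, toy_dirn) / km_per_deg
print(toy_lonlat)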
def process(asdf_source, event_folder, output_path, min_magnitude, restart, save_quality_plots):
    """
    ASDF_SOURCE: Text file containing a list of paths to ASDF files\n
    EVENT_FOLDER: Path to folder containing event files\n
    OUTPUT_PATH: Output folder\n
    """
    comm = MPI.COMM_WORLD
    nproc = comm.Get_size()
    rank = comm.Get_rank()
    proc_workload = None

    if (rank == 0):
        def outputConfigParameters():
            # output config parameters
            fn = 'pick.%s.cfg' % (datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
            fn = os.path.join(output_path, fn)

            f = open(fn, 'w+')
            f.write('Parameter Values:\n\n')
            f.write('%25s\t\t: %s\n' % ('ASDF_SOURCE', asdf_source))
            f.write('%25s\t\t: %s\n' % ('EVENT_FOLDER', event_folder))
            f.write('%25s\t\t: %s\n' % ('OUTPUT_PATH', output_path))
            f.write('%25s\t\t: %s\n' % ('MIN_MAGNITUDE', min_magnitude))
            f.write('%25s\t\t: %s\n' % ('RESTART_MODE', 'TRUE' if restart else 'FALSE'))
            f.write('%25s\t\t: %s\n' % ('SAVE_PLOTS', 'TRUE' if save_quality_plots else 'FALSE'))
            f.close()
        # end func

        outputConfigParameters()
    # end if

    # ==================================================
    # Create output-folder for snr-plots
    # ==================================================
    plot_output_folder = None
    if (save_quality_plots):
        plot_output_folder = os.path.join(output_path, 'plots')
        if (rank == 0):
            if (not os.path.exists(plot_output_folder)):
                os.mkdir(plot_output_folder)
        # end if
        comm.Barrier()
    # end if

    # ==================================================
    # Read catalogue and retrieve origin times
    # ==================================================
    cat = CatalogCSV(event_folder)
    events = cat.get_events()
    originTimestamps = cat.get_preferred_origin_timestamps()

    # ==================================================
    # Create lists of pickers for both p- and s-arrivals
    # ==================================================
    sigmalist = np.arange(8, 3, -1)
    pickerlist_p = []
    pickerlist_s = []
    for sigma in sigmalist:
        picker_p = aicdpicker.AICDPicker(t_ma=5, nsigma=sigma, t_up=1, nr_len=5,
                                         nr_coeff=2, pol_len=10, pol_coeff=10, uncert_coeff=3)
        picker_s = aicdpicker.AICDPicker(t_ma=15, nsigma=sigma, t_up=1, nr_len=5,
                                         nr_coeff=2, pol_len=10, pol_coeff=10, uncert_coeff=3)
        pickerlist_p.append(picker_p)
        pickerlist_s.append(picker_s)
    # end for

    # ==================================================
    # Define theoretical model
    # Instantiate data-access object
    # Retrieve estimated workload
    # ==================================================
    taupyModel = TauPyModel(model='iasp91')
    fds = FederatedASDFDataSet(asdf_source, use_json_db=False, logger=None)
    workload = getWorkloadEstimate(fds, originTimestamps)

    # ==================================================
    # Define output header and open output files
    # depending on the mode of operation (fresh/restart)
    # ==================================================
    header = '#eventID originTimestamp mag originLon originLat originDepthKm net sta cha pickTimestamp ' \
             'stationLon stationLat az baz distance ttResidual snr qualityMeasureCWT domFreq ' \
             'qualityMeasureSlope bandIndex nSigma\n'
    ofnp = os.path.join(output_path, 'p_arrivals.%d.txt' % (rank))
    ofns = os.path.join(output_path, 's_arrivals.%d.txt' % (rank))
    ofp = None
    ofs = None
    if (restart == False):
        ofp = open(ofnp, 'w+')
        ofs = open(ofns, 'w+')
        ofp.write(header)
        ofs.write(header)
    else:
        ofp = open(ofnp, 'a+')
        ofs = open(ofns, 'a+')
    # end if

    progTracker = ProgressTracker(output_folder=output_path, restart_mode=restart)
    totalTraceCount = 0
    for nc, sc, start_time, end_time in fds.local_net_sta_list():
        day = 24 * 3600
        dayCount = 0
        curr = start_time
        traceCountP = 0
        pickCountP = 0
        traceCountS = 0
        pickCountS = 0
        sw_start = datetime.now()
        step = day
        while (curr < end_time):
            if (curr + step > end_time):
                step = end_time - curr
            # end if

            eventIndices = (np.where((originTimestamps >= curr.timestamp) &
                                     (originTimestamps <= (curr + day).timestamp)))[0]

            if (eventIndices.shape[0] > 0):
                totalTraceCount += 1
                stations = fds.get_stations(curr, curr + day, network=nc, station=sc)
                stations_zch = [s for s in stations if 'Z' in s[3]]                  # only Z channels
                stations_nch = [s for s in stations if 'N' in s[3] or '1' in s[3]]   # only N channels
                stations_ech = [s for s in stations if 'E' in s[3] or '2' in s[3]]   # only E channels

                for codes in stations_zch:
                    if (progTracker.increment()):
                        pass
                    else:
                        continue

                    st = fds.get_waveforms(codes[0], codes[1], codes[2], codes[3], curr, curr + step,
                                           automerge=True, trace_count_threshold=200)
                    if (len(st) == 0):
                        continue
                    dropBogusTraces(st)

                    slon, slat = codes[4], codes[5]
                    for ei in eventIndices:
                        event = events[ei]
                        po = event.preferred_origin
                        da = gps2dist_azimuth(po.lat, po.lon, slat, slon)

                        mag = None
                        if (event.preferred_magnitude):
                            mag = event.preferred_magnitude.magnitude_value
                        elif (len(po.magnitude_list)):
                            mag = po.magnitude_list[0].magnitude_value
                        if (mag is None):
                            mag = np.NaN

                        if (np.isnan(mag) or mag < min_magnitude):
                            continue

                        result = extract_p(taupyModel, pickerlist_p, event, slon, slat, st,
                                           plot_output_folder=plot_output_folder)
                        if (result):
                            picklist, residuallist, snrlist, bandindex, pickerindex = result

                            arcdistance = kilometers2degrees(da[0] / 1e3)
                            for ip, pick in enumerate(picklist):
                                line = '%s %f %f %f %f %f ' \
                                       '%s %s %s %f %f %f ' \
                                       '%f %f %f ' \
                                       '%f %f %f %f %f ' \
                                       '%d %d\n' % (event.public_id, po.utctime.timestamp, mag,
                                                    po.lon, po.lat, po.depthkm,
                                                    codes[0], codes[1], codes[3], pick.timestamp, slon, slat,
                                                    da[1], da[2], arcdistance,
                                                    residuallist[ip], snrlist[ip, 0], snrlist[ip, 1],
                                                    snrlist[ip, 2], snrlist[ip, 3],
                                                    bandindex, sigmalist[pickerindex])
                                ofp.write(line)
                            # end for
                            ofp.flush()
                            pickCountP += 1
                        # end if

                        if (len(stations_nch) == 0 and len(stations_ech) == 0):
                            result = extract_s(taupyModel, pickerlist_s, event, slon, slat, st, None, da[2],
                                               plot_output_folder=plot_output_folder)
                            if (result):
                                picklist, residuallist, snrlist, bandindex, pickerindex = result

                                arcdistance = kilometers2degrees(da[0] / 1e3)
                                for ip, pick in enumerate(picklist):
                                    line = '%s %f %f %f %f %f ' \
                                           '%s %s %s %f %f %f ' \
                                           '%f %f %f ' \
                                           '%f %f %f %f %f ' \
                                           '%d %d\n' % (event.public_id, po.utctime.timestamp, mag,
                                                        po.lon, po.lat, po.depthkm,
                                                        codes[0], codes[1], codes[3], pick.timestamp, slon, slat,
                                                        da[1], da[2], arcdistance,
                                                        residuallist[ip], snrlist[ip, 0], snrlist[ip, 1],
                                                        snrlist[ip, 2], snrlist[ip, 3],
                                                        bandindex, sigmalist[pickerindex])
                                    ofs.write(line)
                                # end for
                                ofs.flush()
                                pickCountS += 1
                            # end if
                        # end if
                    # end for
                    traceCountP += len(st)
                # end for

                if (len(stations_nch) > 0 and len(stations_nch) == len(stations_ech)):
                    for codesn, codese in zip(stations_nch, stations_ech):
                        if (progTracker.increment()):
                            pass
                        else:
                            continue

                        stn = fds.get_waveforms(codesn[0], codesn[1], codesn[2], codesn[3], curr, curr + step,
                                                automerge=True, trace_count_threshold=200)
                        ste = fds.get_waveforms(codese[0], codese[1], codese[2], codese[3], curr, curr + step,
                                                automerge=True, trace_count_threshold=200)

                        dropBogusTraces(stn)
                        dropBogusTraces(ste)

                        if (len(stn) == 0):
                            continue
                        if (len(ste) == 0):
                            continue

                        slon, slat = codesn[4], codesn[5]
                        for ei in eventIndices:
                            event = events[ei]
                            po = event.preferred_origin
                            da = gps2dist_azimuth(po.lat, po.lon, slat, slon)

                            mag = None
                            if (event.preferred_magnitude):
                                mag = event.preferred_magnitude.magnitude_value
                            elif (len(po.magnitude_list)):
                                mag = po.magnitude_list[0].magnitude_value
                            if (mag is None):
                                mag = np.NaN

                            if (np.isnan(mag) or mag < min_magnitude):
                                continue

                            result = extract_s(taupyModel, pickerlist_s, event, slon, slat, stn, ste, da[2],
                                               plot_output_folder=plot_output_folder)
                            if (result):
                                picklist, residuallist, snrlist, bandindex, pickerindex = result

                                arcdistance = kilometers2degrees(da[0] / 1e3)
                                for ip, pick in enumerate(picklist):
                                    line = '%s %f %f %f %f %f ' \
                                           '%s %s %s %f %f %f ' \
                                           '%f %f %f ' \
                                           '%f %f %f %f %f ' \
                                           '%d %d\n' % (event.public_id, po.utctime.timestamp, mag,
                                                        po.lon, po.lat, po.depthkm,
                                                        codesn[0], codesn[1], '00T', pick.timestamp, slon, slat,
                                                        da[1], da[2], arcdistance,
                                                        residuallist[ip], snrlist[ip, 0], snrlist[ip, 1],
                                                        snrlist[ip, 2], snrlist[ip, 3],
                                                        bandindex, sigmalist[pickerindex])
                                    ofs.write(line)
                                # end for
                                ofs.flush()
                                pickCountS += 1
                            # end if
                        # end for
                        traceCountS += (len(stn) + len(ste))
                    # end for
                # end if
            # end if

            curr += step
            dayCount += 1
        # wend

        sw_stop = datetime.now()
        totalTime = (sw_stop - sw_start).total_seconds()

        gc.collect()
        print('(Rank %d: %5.2f%%, %d/%d) Processed %d traces and found %d p-arrivals and %d s-arrivals for '
              'network %s station %s in %f s. Memory usage: %5.2f MB.' %
              (rank, (float(totalTraceCount) / float(workload) * 100) if workload > 0 else 100,
               totalTraceCount, workload, traceCountP + traceCountS, pickCountP, pickCountS,
               nc, sc, totalTime,
               round(psutil.Process().memory_info().rss / 1024. / 1024., 2)))
    # end for

    ofp.close()
    ofs.close()

    print('Processing complete on rank %d' % (rank))

    del fds
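# NOTE: dropBogusTraces is called above but not defined in this excerpt. A hypothetical
# sketch of such a filter; the criteria below are illustrative assumptions, not the
# project's actual implementation:
def dropBogusTraces(st, sampling_rate_cutoff=5):
    """Remove clearly unusable traces from a stream in place: empty traces and traces
    with an implausibly low sampling rate (assumed criteria, for illustration only)."""
    badtrs = [tr for tr in st
              if tr.stats.npts == 0 or tr.stats.sampling_rate < sampling_rate_cutoff]
    for tr in badtrs:
        st.remove(tr)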
# Parallelised autopick harvester. With on the order of a million picks to process,
# parallelisation is the only practical way to go about it.
import multiprocessing as mp

import numpy as np
from obspy.clients.fdsn.client import Client

from seismic.ASDFdatabase.FederatedASDFDataSet import FederatedASDFDataSet
from seismic.ml_classifier.data_harvester.autopicks import pickLoaderRand

ic = Client("IRIS")

fds = FederatedASDFDataSet('/g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt',
                           variant='db', use_json_db=True, logger=None)

pl = pickLoaderRand(fds, ic)

nproc = mp.cpu_count()
print(nproc)


def lockInit(l):
    # Make the lock available as a global in each worker process
    global lock
    lock = l


l = mp.Lock()
pool = mp.Pool(processes=nproc, initializer=lockInit, initargs=(l, ))
from obspy.signal.detrend import simple, spline
from obspy.signal.filter import bandpass
from obspy import UTCDateTime, read_events, read_inventory
from obspy.taup.taup_geo import calc_dist
from obspy.clients.iris import Client as IrisClient
from obspy.clients.fdsn import Client
from obspy.taup import TauPyModel
from obspy.signal.trigger import trigger_onset, z_detect, classic_sta_lta, recursive_sta_lta, ar_pick
from obspy.signal.rotate import rotate_ne_rt
from obspy.core.event import Pick, CreationInfo, WaveformStreamID, ResourceIdentifier, Arrival, Event, Origin, \
    OriginQuality, Magnitude, Comment

from seismic.ASDFdatabase.FederatedASDFDataSet import FederatedASDFDataSet

fds = FederatedASDFDataSet('/g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt', logger=None)

stations = fds.get_stations('2009-05-17T00:00:00', '2009-05-18T00:00:00', station='QLP')
print(stations)

s = fds.get_waveforms('AU', 'QLP', '', 'BHE', '2011-03-15T00:00:00', '2011-03-16T00:00:00',
                      trace_count_threshold=10)
print(s)
def main(inventory_file, waveform_database, event_catalog_file, event_trace_datafile,
         start_time, end_time, taup_model, distance_range, magnitude_range, catalog_only=False):

    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    waveform_db_is_web = is_url(waveform_database) \
        or waveform_database in obspy.clients.fdsn.header.URL_MAPPINGS
    if not waveform_db_is_web:
        assert os.path.exists(waveform_database), \
            "Cannot find waveform database file {}".format(waveform_database)
    log.info("Using waveform data source: {}".format(waveform_database))

    min_dist_deg = distance_range[0]
    max_dist_deg = distance_range[1]
    min_mag = magnitude_range[0]
    max_mag = magnitude_range[1]

    inventory = read_inventory(inventory_file)
    log.info("Loaded inventory {}".format(inventory_file))

    # Compute reference lonlat from the inventory.
    channels = inventory.get_contents()['channels']
    lonlat_coords = []
    for ch in channels:
        coords = inventory.get_coordinates(ch)
        lonlat_coords.append((coords['longitude'], coords['latitude']))
    # end for
    lonlat_coords = np.array(lonlat_coords)
    lonlat = np.mean(lonlat_coords, axis=0)
    log.info("Inferred reference coordinates {}".format(lonlat))

    # If start and end time not provided, infer from date range of inventory.
    if not start_time:
        start_time = inventory[0].start_date
        for net in inventory:
            start_time = min(start_time, net.start_date)
        log.info("Inferred start time {}".format(start_time))
    # end if
    if not end_time:
        end_time = inventory[0].end_date
        if end_time is None:
            end_time = UTC.now()
        for net in inventory:
            end_time = max(end_time, net.end_date)
        log.info("Inferred end time {}".format(end_time))
    # end if

    start_time = UTC(start_time)
    end_time = UTC(end_time)
    event_catalog_file = timestamp_filename(event_catalog_file, start_time, end_time)
    event_trace_datafile = timestamp_filename(event_trace_datafile, start_time, end_time)
    assert not os.path.exists(event_trace_datafile), \
        "Output file {} already exists, please remove!".format(event_trace_datafile)
    log.info("Traces will be written to: {}".format(event_trace_datafile))

    exit_after_catalog = catalog_only
    catalog = get_events(lonlat, start_time, end_time, event_catalog_file,
                         (min_dist_deg, max_dist_deg), (min_mag, max_mag), exit_after_catalog)

    if waveform_db_is_web:
        log.info("Use fresh query results from web")
        client = Client(waveform_database)
        waveform_getter = client.get_waveforms
    else:
        # Form closure to allow waveform source file to be derived from a setting (or command line input)
        asdf_dataset = FederatedASDFDataSet(waveform_database, logger=log)

        def closure_get_waveforms(network, station, location, channel, starttime, endtime):
            return asdf_get_waveforms(asdf_dataset, network, station, location, channel,
                                      starttime, endtime)

        waveform_getter = closure_get_waveforms
    # end if

    with tqdm(smoothing=0) as pbar:
        stream_count = 0
        for s in iter_event_data(catalog, inventory, waveform_getter, tt_model=taup_model, pbar=pbar):
            # Write traces to output file in append mode so that arbitrarily large file
            # can be processed. If the file already exists, then existing streams will
            # be overwritten rather than duplicated.
            # Check first if rotation for unaligned *H1, *H2 channels to *HN, *HE is required.
            if not s:
                continue
            # end if
            if s.select(component='1') and s.select(component='2'):
                try:
                    s.rotate('->ZNE', inventory=inventory)
                except ValueError as e:
                    log.error('Unable to rotate to ZNE with error:\n{}'.format(str(e)))
                    continue
                # end try
            # end if
            # Order the traces in ZNE ordering. This is required so that normalization
            # can be specified in terms of an integer index, i.e. the default of 0 in rf
            # library will normalize against the Z component.
            s.traces = sorted(s.traces, key=zne_order)
            # Assert the ordering of traces in the stream is ZNE.
            assert s[0].stats.channel[-1] == 'Z'
            assert s[1].stats.channel[-1] == 'N'
            assert s[2].stats.channel[-1] == 'E'
            # Iterator returns rf.RFStream. Write traces from obspy.Stream to decouple from RFStream.
            grp_id = '.'.join(s.traces[0].id.split('.')[0:3])
            event_time = str(s.traces[0].meta.event_time)[0:19]
            pbar.set_description("{} -- {}".format(grp_id, event_time))
            out_stream = obspy.Stream([tr for tr in s])
            assert out_stream[0].stats.channel[-1] == 'Z'
            assert out_stream[1].stats.channel[-1] == 'N'
            assert out_stream[2].stats.channel[-1] == 'E'
            write_h5_event_stream(event_trace_datafile, out_stream, mode='a')
            stream_count += 1
        # end for

        if stream_count == 0:
            log.warning("No traces found!")
        else:
            log.info("Wrote {} streams to output file".format(stream_count))
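# NOTE: timestamp_filename is used in both extraction scripts above but is not defined in
# this excerpt. A hypothetical sketch of such a helper, which embeds the start and end times
# in the output file name; the exact time format used by the project is an assumption:
import os

def timestamp_filename(fname, t0, t1):
    """Append start/end timestamps to a file name, e.g. 'events.xml' ->
    'events_20100101T000000-20110101T000000.xml'. Illustrative implementation only."""
    base, ext = os.path.splitext(fname)
    return '{}_{}-{}{}'.format(base, t0.strftime("%Y%m%dT%H%M%S"),
                               t1.strftime("%Y%m%dT%H%M%S"), ext)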