def test_unclustered(self):
    """Test clustering on unclustered data..."""
    testing_path = os.path.join(self.testing_path, 'WAV', 'TEST_')
    stream_files = glob.glob(os.path.join(testing_path, '*DFDPC*'))[0:10]
    stream_list = [(read(stream_file), i)
                   for i, stream_file in enumerate(stream_files)]
    for st in stream_list:
        for tr in st[0]:
            if tr.stats.sampling_rate != 100.0:
                ratio = tr.stats.sampling_rate / 100
                if int(ratio) == ratio:
                    tr.decimate(int(ratio))
                else:
                    tr.resample(100)
    shortest_tr = min(
        [tr.stats.npts for st in stream_list for tr in st[0]])
    for st in stream_list:
        for tr in st[0]:
            tr.data = tr.data[0:shortest_tr]
    groups = cluster(template_list=stream_list, show=False,
                     corr_thresh=0.3)
    self.assertEqual(len(groups), 10)  # They shouldn't cluster at all
    # Test setting a number of cores
    groups_2 = cluster(template_list=stream_list, show=False,
                       corr_thresh=0.3, cores=2, save_corrmat=True)
    self.assertTrue(os.path.isfile('dist_mat.npy'))
    os.remove('dist_mat.npy')
    self.assertEqual(len(groups_2), 10)  # They shouldn't cluster at all
    self.assertEqual(groups, groups_2)
def test_clustered(self):
    """Test clustering on clustered data..."""
    groups = cluster(
        template_list=[(st, i) for i, st in enumerate(self.stream_list)],
        show=False, corr_thresh=0.3)
    self.assertEqual(len(groups), 9)
def test_unclustered(self):
    """Test clustering on unclustered data..."""
    testing_path = os.path.join(self.testing_path, 'WAV', 'TEST_')
    stream_files = glob.glob(os.path.join(testing_path, '*DFDPC*'))[0:10]
    stream_list = [(read(stream_file), i)
                   for i, stream_file in enumerate(stream_files)]
    groups = cluster(template_list=stream_list, show=False,
                     corr_thresh=0.3)
    self.assertEqual(len(groups), 10)  # They shouldn't cluster at all
    # Test setting a number of cores
    groups_2 = cluster(template_list=stream_list, show=False,
                       corr_thresh=0.3, cores=2, debug=2,
                       save_corrmat=True)
    self.assertTrue(os.path.isfile('dist_mat.npy'))
    os.remove('dist_mat.npy')
    self.assertEqual(len(groups_2), 10)  # They shouldn't cluster at all
    self.assertEqual(groups, groups_2)
def test_unclustered(self):
    """Test clustering on unclustered data..."""
    from obspy import read
    import glob
    import os
    from eqcorrscan.utils.clustering import cluster
    testing_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'test_data', 'WAV',
        'TEST_')
    stream_files = glob.glob(os.path.join(testing_path, '*'))[0:10]
    stream_list = [(read(stream_file), i)
                   for i, stream_file in enumerate(stream_files)]
    groups = cluster(template_list=stream_list, show=False,
                     corr_thresh=0.3)
    self.assertEqual(len(groups), 10)  # They shouldn't cluster at all
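# Illustrative helper (an assumption, not part of the original tests):
# cluster() is fed a list of (Stream, index) tuples in these tests and
# returns a list of groups in that same tuple form, so a group can be
# unpacked like this.
def _unpack_groups(groups):
    """Split groups of (Stream, index) tuples into streams and indices."""
    streams = [[st_tuple[0] for st_tuple in group] for group in groups]
    indices = [[st_tuple[1] for st_tuple in group] for group in groups]
    return streams, indices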
def test_clustered(self):
    """Test clustering on clustered data..."""
    testing_path = os.path.join(self.testing_path, 'similar_events')
    stream_files = glob.glob(os.path.join(testing_path, '*'))
    stream_list = [(read(stream_file), i)
                   for i, stream_file in enumerate(stream_files)]
    for stream in stream_list:
        # Iterate over a copy of the trace list: removing traces from a
        # stream while iterating over it skips the following trace.
        for tr in list(stream[0]):
            if tr.stats.station not in ['WHAT2', 'WV04', 'GCSZ']:
                stream[0].remove(tr)
                continue
            tr.detrend('simple')
            tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
            tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
    groups = cluster(template_list=stream_list, show=False,
                     corr_thresh=0.3)
    self.assertEqual(len(groups), 9)  # They should cluster reasonably
def test_clustered(self):
    """Test clustering on clustered data..."""
    from obspy import read
    import glob
    import os
    from eqcorrscan.utils.clustering import cluster
    testing_path = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'test_data',
        'similar_events')
    stream_files = glob.glob(os.path.join(testing_path, '*'))
    stream_list = [(read(stream_file), i)
                   for i, stream_file in enumerate(stream_files)]
    for stream in stream_list:
        # Iterate over a copy of the trace list to avoid skipping traces
        # when removing from the stream mid-iteration.
        for tr in list(stream[0]):
            if tr.stats.station not in ['WHAT2', 'WV04', 'GCSZ']:
                stream[0].remove(tr)
                continue
            tr.detrend('simple')
            tr.filter('bandpass', freqmin=5.0, freqmax=15.0)
            tr.trim(tr.stats.starttime + 40, tr.stats.endtime - 45)
    groups = cluster(template_list=stream_list, show=False,
                     corr_thresh=0.3)
    self.assertEqual(len(groups), 9)  # They should cluster reasonably
# Module-level imports assumed for this snippet (paths follow EQcorrscan's
# layout); cluster_from_dist_mat is defined elsewhere in this module.
import warnings
from glob import glob
import numpy as np
from obspy import read
from eqcorrscan.core.match_filter import Tribe, Template
from eqcorrscan.utils import clustering, stacking
from eqcorrscan.utils.pre_processing import shortproc


def cluster_tribe(tribe, raw_wav_dir, lowcut, highcut, samp_rate, filt_order,
                  pre_pick, length, shift_len, corr_thresh, cores,
                  dist_mat=False, show=False):
    """
    Cross correlate all templates in a tribe and return separate tribes
    for each cluster.

    :param tribe: Tribe to cluster
    :return: List of Tribes, one per cluster

    .. note:: Functionality here is pillaged from align_design, as we
        don't want the multiplexed portion of that function.
    """
    tribe.sort()
    raw_wav_files = glob('%s/*' % raw_wav_dir)
    raw_wav_files.sort()
    all_wavs = [wav.split('/')[-1].split('.')[0] for wav in raw_wav_files]
    names = [t.name for t in tribe if t.name in all_wavs]
    wavs = [wav for wav in raw_wav_files
            if wav.split('/')[-1].split('.')[0] in names]
    new_tribe = Tribe()
    new_tribe.templates = [temp for temp in tribe if temp.name in names]
    print('Processing temps')
    temp_list = [(shortproc(read(tmp), lowcut=lowcut, highcut=highcut,
                            samp_rate=samp_rate, filt_order=filt_order,
                            parallel=True, num_cores=cores), template)
                 for tmp, template in zip(wavs, new_tribe)]
    print('Clipping traces')
    for temp in temp_list:
        print('Clipping template %s' % temp[1].name)
        for tr in temp[0]:
            pk = [pk for pk in temp[1].event.picks
                  if pk.waveform_id.station_code == tr.stats.station
                  and pk.waveform_id.channel_code == tr.stats.channel][0]
            tr.trim(starttime=pk.time - shift_len - pre_pick,
                    endtime=pk.time - pre_pick + length + shift_len)
    trace_lengths = [tr.stats.endtime - tr.stats.starttime
                     for st in temp_list for tr in st[0]]
    clip_len = min(trace_lengths) - (2 * shift_len)
    stachans = list(set([(tr.stats.station, tr.stats.channel)
                         for st in temp_list for tr in st[0]]))
    print('Aligning traces')
    for stachan in stachans:
        trace_list = []
        trace_ids = []
        for i, st in enumerate(temp_list):
            tr = st[0].select(station=stachan[0], channel=stachan[1])
            if len(tr) > 0:
                trace_list.append(tr[0])
                trace_ids.append(i)
            if len(tr) > 1:
                warnings.warn('Too many matches for %s %s' %
                              (stachan[0], stachan[1]))
        shift_len_samples = int(shift_len *
                                trace_list[0].stats.sampling_rate)
        shifts, cccs = stacking.align_traces(
            trace_list=trace_list, shift_len=shift_len_samples,
            positive=True)
        for i, shift in enumerate(shifts):
            st = temp_list[trace_ids[i]][0]
            start_t = st.select(station=stachan[0],
                                channel=stachan[1])[0].stats.starttime
            start_t += shift_len
            start_t -= shift
            st.select(station=stachan[0],
                      channel=stachan[1])[0].trim(start_t,
                                                  start_t + clip_len)
    print('Clustering')
    if isinstance(dist_mat, np.ndarray):
        groups = cluster_from_dist_mat(dist_mat=dist_mat,
                                       temp_list=temp_list, show=show,
                                       corr_thresh=corr_thresh)
    else:
        groups = clustering.cluster(temp_list, show=show,
                                    corr_thresh=corr_thresh,
                                    allow_shift=False, save_corrmat=True,
                                    cores=cores)
    group_tribes = []
    for group in groups:
        group_tribes.append(
            Tribe(templates=[Template(
                st=tmp[0], name=tmp[1].name, event=tmp[1].event,
                highcut=highcut, lowcut=lowcut, samp_rate=samp_rate,
                filt_order=filt_order, prepick=pre_pick)
                for tmp in group]))
    return group_tribes
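# A minimal usage sketch for cluster_tribe above. The archive name
# 'tribe.tgz', the 'raw_wavs' directory, and all numeric parameters are
# illustrative assumptions, not recommendations.
if __name__ == '__main__':
    tribe = Tribe().read('tribe.tgz')  # hypothetical tribe archive
    group_tribes = cluster_tribe(
        tribe, raw_wav_dir='raw_wavs', lowcut=2.0, highcut=10.0,
        samp_rate=50.0, filt_order=4, pre_pick=0.5, length=4.0,
        shift_len=0.2, corr_thresh=0.4, cores=4)
    print('Split tribe into %d clusters' % len(group_tribes))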
              index=False, header=False)
for st in inv.networks[0].stations:
    stream_list_l = [(Stream(traces=[tr]), i) for i, tr in enumerate(
        stream.select(component='L',
                      station=st.code).trim2(-5, 20, 'onset'))]
    stream_list = [(Stream(traces=[tr]), i) for i, tr in enumerate(
        stream.select(component='Q',
                      station=st.code).trim2(-5, 20, 'onset'))]
    try:
        groups = cluster(template_list=stream_list, show=False,
                         corr_thresh=0.3, cores=4)
        if len(groups) < len(stream_list):
            # Find the largest cluster
            group_max = groups[0]
            for g in groups:
                if len(g) > len(group_max):
                    group_max = g
            # Collect the matching L-component traces by group index
            group_max_l = Stream(
                traces=[stream_list_l[ind[1]][0][0] for ind in group_max])
            group_streams = [st_tuple[0] for st_tuple in group_max]
            group_streams_l = [Stream(traces=[tr]) for tr in group_max_l]
            stack = PWS_stack(streams=group_streams)
        continue
    elif len(cat) < cpu_count():
        cores = len(cat)
    elif len(cat) >= cpu_count():
        cores = 'all'
    grp_num = space_cat.split('/')[-1].split('_')[-1].rstrip('.xml')
    template_list = [(template_dict[ev.resource_id], ev.resource_id)
                     for ev in cat]
    plt_name = ('/media/chet/hdd/seismic/NZ/catalogs/corr_figs/'
                '1_sec_temps/spacegrp_%s_dend_0.20.png' % grp_num)
    corr_mat = ('/media/chet/hdd/seismic/NZ/catalogs/corr_figs/'
                '1_sec_temps/spacegrp_%s_mat.npy' % grp_num)
    groups = clustering.cluster(template_list, corr_thresh=0.30,
                                allow_shift=True, shift_len=25,
                                save_corrmat=True, cores=cores, debug=2)
    for i, grp in enumerate(groups):
        corrgrp_cat = Catalog()
        f_name_root = '/media/chet/hdd/seismic/NZ/catalogs/'
        f_name = 'spacegrp_%s_corrgrp_%03d' % (grp_num, i)
        for e in cat:
            for temp_st in grp:
                if e.resource_id == temp_st[1]:
                    corrgrp_cat.append(e)
        corrgrp_cat.write(f_name_root + 'qml/corr_groups/1_sec_temps/' +
                          f_name + '.xml', format="QUAKEML")
        corrgrp_cat.write(f_name_root + 'shp/corr_groups/1_sec_temps/' +
# (Assumes the same module-level imports as cluster_tribe above, plus
# Catalog and Stream from obspy; cluster_from_dist_mat is defined elsewhere
# in this module.)
def cluster_cat(catalog, corr_thresh, corr_params=None, raw_wav_dir=None,
                dist_mat=False, out_cat=None, show=False, method='average'):
    """
    Cross correlate all events in a catalog and return separate tribes and
    catalogs for each cluster.

    :param catalog: Catalog to cluster
    :param corr_thresh: Correlation threshold for clustering
    :param corr_params: Dictionary of filter parameters. Must include keys:
        lowcut, highcut, samp_rate, filt_order, pre_pick, length, shift_len,
        cores. If None, waveform processing is skipped and templates are
        populated with empty Streams.
    :param raw_wav_dir: Directory of waveforms to take from
    :param dist_mat: If there's a precomputed distance matrix, use this
        instead of doing all the correlations
    :param out_cat: Output catalog corresponding to the events
    :param show: Show the dendrogram? Careful as this can exceed max
        recursion
    :param method: Linkage method for the hierarchical clustering
    :return: List of Tribes and list of Catalogs, one per cluster

    .. note:: Functionality here is pillaged from align_design, as we
        don't want the multiplexed portion of that function.
    """
    if corr_params and raw_wav_dir:
        shift_len = corr_params['shift_len']
        lowcut = corr_params['lowcut']
        highcut = corr_params['highcut']
        samp_rate = corr_params['samp_rate']
        filt_order = corr_params['filt_order']
        pre_pick = corr_params['pre_pick']
        length = corr_params['length']
        cores = corr_params['cores']
        raw_wav_files = glob('%s/*' % raw_wav_dir)
        raw_wav_files.sort()
        all_wavs = [wav.split('/')[-1].split('_')[-3]
                    for wav in raw_wav_files]
        print(all_wavs[0])
        names = [ev.resource_id.id.split('/')[-1] for ev in catalog
                 if ev.resource_id.id.split('/')[-1] in all_wavs]
        print(names[0])
        wavs = [wav for wav in raw_wav_files
                if wav.split('/')[-1].split('_')[-3] in names]
        print(wavs[0])
        new_cat = Catalog(events=[
            ev for ev in catalog
            if ev.resource_id.id.split('/')[-1] in names])
        print('Processing temps')
        # Keep the Event alongside each processed stream so that template
        # names and per-group catalogs can be rebuilt below
        temp_list = [(shortproc(read('{}/*'.format(tmp)), lowcut=lowcut,
                                highcut=highcut, samp_rate=samp_rate,
                                filt_order=filt_order, parallel=True,
                                num_cores=cores), ev)
                     for tmp, ev in zip(wavs, new_cat)]
        print('Clipping traces')
        rm_temps = []
        rm_ev = []  # Events to remove, kept in step with rm_temps
        for i, temp in enumerate(temp_list):
            print('Clipping template %s' % new_cat[i].resource_id.id)
            rm_ts = []  # Make a list of traces with no pick to remove
            for tr in temp[0]:
                pk = [pk for pk in new_cat[i].picks
                      if pk.waveform_id.station_code == tr.stats.station
                      and pk.waveform_id.channel_code == tr.stats.channel]
                if len(pk) == 0:
                    rm_ts.append(tr)
                else:
                    tr.trim(starttime=pk[0].time - shift_len - pre_pick,
                            endtime=pk[0].time - pre_pick + length +
                            shift_len)
            # Remove pickless traces
            for rm in rm_ts:
                temp[0].traces.remove(rm)
            # If trace lengths are internally inconsistent, remove template
            if len(list(set([len(tr) for tr in temp[0]]))) > 1:
                rm_temps.append(temp)
            # If template is now length 0, remove it and associated event
            if len(temp[0]) == 0:
                rm_temps.append(temp)
                rm_ev.append(new_cat[i])
        for t in rm_temps:
            temp_list.remove(t)
        # Remove the corresponding events as well so catalog and distmat
        # are the same shape
        for rme in rm_ev:
            new_cat.events.remove(rme)
        print(new_cat)
        new_cat.write(out_cat, format="QUAKEML")
    print('Clustering')
    if isinstance(dist_mat, np.ndarray):
        print('Assuming the tribe provided is the same shape as dist_mat')
        # Dummy streams
        temp_list = [(Stream(), ev) for ev in catalog]
        groups = cluster_from_dist_mat(dist_mat=dist_mat,
                                       temp_list=temp_list, show=show,
                                       corr_thresh=corr_thresh,
                                       method=method)
    else:
        groups = clustering.cluster(temp_list, show=show,
                                    corr_thresh=corr_thresh,
                                    shift_len=shift_len * 2,
                                    save_corrmat=True, cores=cores)
    group_tribes = []
    group_cats = []
    if corr_params:
        for group in groups:
            group_tribes.append(
                Tribe(templates=[Template(
                    st=tmp[0],
                    name=tmp[1].resource_id.id.split('/')[-1],
                    event=tmp[1], highcut=highcut, lowcut=lowcut,
                    samp_rate=samp_rate, filt_order=filt_order,
                    prepick=pre_pick) for tmp in group]))
            group_cats.append(Catalog(events=[tmp[1] for tmp in group]))
    else:
        for group in groups:
            # tmp[1] is the Event itself here (dummy-stream case above)
            group_tribes.append(
                Tribe(templates=[Template(
                    st=tmp[0],
                    name=tmp[1].resource_id.id.split('/')[-1],
                    event=tmp[1], highcut=None, lowcut=None,
                    samp_rate=None, filt_order=None,
                    prepick=None) for tmp in group]))
            group_cats.append(Catalog(events=[tmp[1] for tmp in group]))
    return group_tribes, group_cats
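# A minimal usage sketch for cluster_cat above. 'catalog.xml', the
# 'event_wavs' directory, and the corr_params values are illustrative
# assumptions, not recommendations.
if __name__ == '__main__':
    from obspy import read_events

    catalog = read_events('catalog.xml')  # hypothetical catalog file
    corr_params = dict(lowcut=2.0, highcut=10.0, samp_rate=50.0,
                       filt_order=4, pre_pick=0.5, length=4.0,
                       shift_len=0.2, cores=4)
    group_tribes, group_cats = cluster_cat(
        catalog, corr_thresh=0.4, corr_params=corr_params,
        raw_wav_dir='event_wavs', out_cat='clustered_cat.xml')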
import numpy as np

temp_dir = '/media/rotnga_data/templates/2015_dayproc/*'
temp_files = glob(temp_dir)
temp_files.sort()
template_list = []
files_wo_data = []
for filename in temp_files:
    try:
        template_list.append(read(filename))
    except TypeError:
        print('No actual data in this file')
        files_wo_data.append(filename)
# Run hierarchical clustering function
groups = clustering.cluster(template_list, show=False, corr_thresh=0.28,
                            save_corrmat=True, debug=2)

"""
Now compute the SVD (or empirical approximation) for each family of MORE
THAN ONE event
Use SVD() or empirical_SVD()
"""
# First, empirical_SVD
first_subspace = []
second_subspace = []
for group in groups:
    if len(group) > 1:
        [first, second] = clustering.empirical_SVD(group)
        # Account for np.diff() returning array with len one less than
        # original
        for tr in second:
            tr.data = np.concatenate(([0.0], tr.data))
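# Hedged sketch of the SVD() route mentioned in the comment block above,
# assuming the pre-0.2 EQcorrscan API used by this script (these functions
# were later renamed svd() and svd_to_stream()): SVD() returns
# (SVectors, SValues, Uvectors, stachans). k=1 and the 100 Hz sampling
# rate are illustrative assumptions about these templates.
for group in groups:
    if len(group) > 1:
        SVectors, SValues, Uvectors, stachans = clustering.SVD(group)
        # Rebuild one Stream per channel from the first (largest)
        # singular vector
        SVstreams = clustering.SVD_2_stream(SVectors, stachans, 1, 100.0)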