def rot_12_NE(st: Stream, meta: dict) -> Stream:
    '''
    Performs a 12 -> NE rotation.

    :param st: A stream containing the traces to rotate.
    :param dict meta: Dictionary containing the metadata for all streams.
    :return: The stream with rotated traces.
    '''
    st2 = st.select(channel="??1")
    for tr in st2:
        id1 = tr.id
        id2 = id1[:-1] + '2'
        tr1 = tr
        try:
            tr2 = st.select(id=id2)[0]
        except IndexError:
            # Without the matching "2" component the pair cannot be rotated,
            # so drop the orphaned "1" component from the stream.
            st.remove(tr)
            logger.warning(
                "%s Channel 2 not found. Impossible to rotate", tr.id)
            continue
        # Trim both traces to their common time window
        timeA = max(tr1.stats.starttime, tr2.stats.starttime)
        timeB = min(tr1.stats.endtime, tr2.stats.endtime)
        tr1.trim(timeA, timeB)
        tr2.trim(timeA, timeB)
        # Rotate back by the sensor azimuth to obtain N/E components
        azi = meta[id1]['azimuth']
        tr1.data, tr2.data = rot2D(tr1.data, tr2.data, -azi)
    return st
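# `rot2D` is called above but not defined in this section. Below is a minimal
# sketch of what it plausibly does, assuming `angle` is in degrees and that a
# positive angle rotates the (x, y) pair counter-clockwise; both are
# assumptions, not confirmed by the source.
import numpy as np


def rot2D(x: np.ndarray, y: np.ndarray, angle: float):
    """Rotate the component pair (x, y) by `angle` degrees."""
    phi = np.radians(angle)
    xr = np.cos(phi) * x - np.sin(phi) * y
    yr = np.sin(phi) * x + np.cos(phi) * y
    return xr, yr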
def test_xyzalgorithm_uneccesary_channel_empty():
    """XYZAlgorithm_test.test_xyzalgorithm_uneccesary_channel_empty()

    confirms the process will run when an unnecessary channel is input but
    contains gaps or is completely empty, i.e. gaps in the 'Z' channel or an
    empty 'F' channel. This also makes sure the 'Z' and 'F' channels are
    passed through without any modification.
    """
    algorithm = XYZAlgorithm("obs", "mag")
    timeseries = Stream()
    timeseries += __create_trace("H", [1, 1])
    timeseries += __create_trace("E", [1, 1])
    timeseries += __create_trace("Z", [1, np.nan])
    timeseries += __create_trace("F", [np.nan, np.nan])
    outstream = algorithm.process(timeseries)
    assert_equal(
        outstream.select(channel="Z")[0].data.all(),
        timeseries.select(channel="Z")[0].data.all(),
    )
    assert_equal(
        outstream.select(channel="F")[0].data.all(),
        timeseries.select(channel="F")[0].data.all(),
    )
    ds = outstream.select(channel="D")
    # there is 1 trace
    assert_equal(len(ds), 1)
    d = ds[0]
    # d has 2 values (same as input)
    assert_equal(len(d.data), 2)
    # d has no NaN values
    assert_equal(np.isnan(d.data).any(), False)
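# `__create_trace` is a test helper that is not shown in this section. A
# minimal sketch, assuming it only needs to set the channel code and the
# samples (any other header fields are left at obspy defaults):
import numpy as np
from obspy.core import Trace


def __create_trace(channel, data):
    """Build a single-channel obspy Trace from a plain list of samples."""
    return Trace(data=np.array(data, dtype=np.float64),
                 header={"channel": channel})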
def save_raw(saved: dict, st: Stream, rawloc: str, inv: Inventory,
             saveasdf: bool):
    """
    Save the raw waveform data in the desired format.
    The point of this function is mainly that the waveforms will be saved
    with the correct associations and at the correct locations.

    :param saved: Dictionary holding information about the original streams
        to identify them afterwards.
    :type saved: dict
    :param st: obspy stream holding all data (from various stations)
    :type st: Stream
    :param rawloc: Parent directory (with phase) to save the files in.
    :type rawloc: str
    :param inv: The inventory holding all the station information
    :type inv: Inventory
    :param saveasdf: If True, the data will be saved in ASDF format.
    :type saveasdf: bool
    """
    # Just use the same name
    for evt, startt, endt, net, stat in zip(
            saved['event'], saved['startt'], saved['endt'], saved['net'],
            saved['stat']):
        # Earlier we downloaded all locations, but we don't really want to
        # have several, so let's just keep one.
        try:
            sst = st.select(network=net, station=stat)
            # This might actually be empty; if so, let's just skip.
            if sst.count() == 0:
                logging.debug(f'No trace of {net}.{stat} in Stream.')
                continue
            slst = sst.slice(startt, endt)
            # Only write the most prevalent location code
            locs = [tr.stats.location for tr in sst]
            filtloc = max(set(locs), key=locs.count)
            sslst = slst.select(location=filtloc)
            if saveasdf:
                sinv = inv.select(net, stat, starttime=startt, endtime=endt)
                write_st(sslst, evt, rawloc, sinv)
            else:
                save_raw_mseed(evt, sslst, rawloc, net, stat)
        except Exception as e:
            logging.error(e)
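# `save_raw_mseed` is referenced above but not shown. A minimal sketch,
# assuming a `<rawloc>/<event-id>/<net>.<stat>.mseed` layout; the directory
# scheme and the use of the event resource_id are assumptions, not the
# confirmed implementation.
import os


def save_raw_mseed(evt, sslst, rawloc, net, stat):
    """Write the sliced stream for one station to a miniSEED file."""
    # Short event identifier derived from the QuakeML resource id
    evid = str(evt.resource_id).split('/')[-1]
    outdir = os.path.join(rawloc, evid)
    os.makedirs(outdir, exist_ok=True)
    sslst.write(os.path.join(outdir, f'{net}.{stat}.mseed'), format='MSEED')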
def analyze_data(families, staloc, nhour, t1, duration, dt, ncpu, icpu):
    """
    Scan nhour hours of data starting at t1 for the subset of LFE families
    assigned to CPU icpu (out of ncpu) and append the detections to each
    family's catalog. (template_dir, type_threshold, threshold, and freq0
    are module-level settings in this script.)
    """
    nfamilies = int(ceil(len(families) / ncpu))
    ibegin = icpu * nfamilies
    iend = min((icpu + 1) * nfamilies, len(families))

    for i in range(ibegin, iend):
        # Create directory to store the LFE times
        namedir = 'LFEs/' + families['family'].iloc[i]
        if not os.path.exists(namedir):
            os.makedirs(namedir)

        # File to write error messages
        namedir = 'error'
        if not os.path.exists(namedir):
            os.makedirs(namedir)
        errorfile = 'error/' + families['family'].iloc[i] + '.txt'

        # Create dataframe to store LFE times
        df = pd.DataFrame(columns=['year', 'month', 'day', 'hour',
                                   'minute', 'second', 'cc', 'nchannel'])

        # Read the templates
        stations = families['stations'].iloc[i].split(',')
        templates = Stream()
        for station in stations:
            templatefile = template_dir + '/' + \
                families['family'].iloc[i] + '/' + station + '.pkl'
            with open(templatefile, 'rb') as f:
                data = pickle.load(f)
            if len(data) == 3:
                (EW, NS, UD) = data
                EW.stats.station = station
                NS.stats.station = station
                EW.stats.channel = 'E'
                NS.stats.channel = 'N'
                templates.append(EW)
                templates.append(NS)
            else:
                UD = data[0]
                UD.stats.station = station
                UD.stats.channel = 'Z'
                templates.append(UD)

        # Loop on hours of data
        for hour in range(0, nhour):
            nchannel = 0
            Tstart = t1 + hour * 3600.0
            Tend = t1 + (hour + 1) * 3600.0 + duration
            delta = Tend - Tstart
            ndata = int(delta / dt) + 1

            # Get the data
            data = []
            for station in stations:
                try:
                    D = read('tmp/' + station + '.mseed')
                    D = D.slice(Tstart, Tend)
                    namefile = 'tmp/' + station + '.pkl'
                    with open(namefile, 'rb') as f:
                        orientation = pickle.load(f)
                    # Get station metadata for reading response file
                    for ir in range(0, len(staloc)):
                        if station == staloc['station'][ir]:
                            network = staloc['network'][ir]
                            channels = staloc['channels'][ir]
                            location = staloc['location'][ir]
                            server = staloc['server'][ir]
                    # Orientation of template
                    # Date chosen: April 1st 2008
                    mychannels = channels.split(',')
                    mylocation = location
                    if mylocation == '--':
                        mylocation = ''
                    response = '../data/response/' + network + '_' + \
                        station + '.xml'
                    inventory = read_inventory(response, format='STATIONXML')
                    reference = []
                    for channel in mychannels:
                        angle = inventory.get_orientation(
                            network + '.' + station + '.' + mylocation +
                            '.' + channel, UTCDateTime(2008, 4, 1, 0, 0, 0))
                        reference.append(angle)
                    # Append data to stream
                    if isinstance(D, obspy.core.stream.Stream):
                        stationdata = fill_data(D, orientation, station,
                                                channels, reference)
                        if len(stationdata) > 0:
                            for stream in stationdata:
                                data.append(stream)
                except Exception:
                    message = 'No data available for station {} '.format(
                        station) + 'at time {}/{}/{} - {}:{}:{}\n'.format(
                        Tstart.year, Tstart.month, Tstart.day, Tstart.hour,
                        Tstart.minute, Tstart.second)
                    # Record the failure (assumed intent: the original
                    # composed `message` but never used `errorfile`)
                    with open(errorfile, 'a') as f:
                        f.write(message)

            # Loop on channels
            for channel in range(0, len(data)):
                subdata = data[channel]
                # Check whether we have a complete one-hour-long recording
                if len(subdata) == 1:
                    if len(subdata[0].data) == ndata:
                        # Get the template
                        station = subdata[0].stats.station
                        component = subdata[0].stats.channel
                        template = templates.select(station=station,
                                                    component=component)[0]
                        # Cross correlation
                        cctemp = correlate.optimized(template, subdata[0])
                        if nchannel > 0:
                            cc = np.vstack((cc, cctemp))
                        else:
                            cc = cctemp
                        nchannel = nchannel + 1

            if nchannel > 0:
                # Compute average cross-correlation across channels
                meancc = np.mean(cc, axis=0)
                if type_threshold == 'MAD':
                    MAD = np.median(np.abs(meancc - np.mean(meancc)))
                    index = np.where(meancc >= threshold * MAD)
                elif type_threshold == 'Threshold':
                    index = np.where(meancc >= threshold)
                else:
                    raise ValueError(
                        'Type of threshold must be MAD or Threshold')
                times = np.arange(0.0, np.shape(meancc)[0] * dt, dt)
                # Get LFE times
                if np.shape(index)[1] > 0:
                    (time, cc) = clean_LFEs(index, times, meancc, dt, freq0)
                    # Add LFE times to dataframe
                    i0 = len(df.index)
                    for j in range(0, len(time)):
                        timeLFE = Tstart + time[j]
                        df.loc[i0 + j] = [int(timeLFE.year),
                            int(timeLFE.month), int(timeLFE.day),
                            int(timeLFE.hour), int(timeLFE.minute),
                            timeLFE.second +
                            timeLFE.microsecond / 1000000.0,
                            cc[j], nchannel]

        # Add to pandas dataframe and save
        namefile = 'LFEs/' + families['family'].iloc[i] + '/catalog.pkl'
        if os.path.exists(namefile):
            df_all = pickle.load(open(namefile, 'rb'))
            df_all = pd.concat([df_all, df], ignore_index=True)
        else:
            df_all = df
        df_all = df_all.astype(dtype={'year': 'int32', 'month': 'int32',
            'day': 'int32', 'hour': 'int32', 'minute': 'int32',
            'second': 'float', 'cc': 'float', 'nchannel': 'int32'})
        pickle.dump(df_all, open(namefile, 'wb'))
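# Self-contained illustration of the MAD-based detection rule used above,
# with made-up numbers: a mean cross-correlation trace with one clear peak
# and a threshold of 6 (i.e. a detection must reach 6 * MAD).
import numpy as np

meancc = np.array([0.01, -0.02, 0.015, 0.30, -0.01, 0.02, -0.015, 0.01])
threshold = 6.0
MAD = np.median(np.abs(meancc - np.mean(meancc)))
index = np.where(meancc >= threshold * MAD)
print(index[0])  # -> [3]: only the 0.30 peak reaches 6 * MAD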
def find_LFEs(family_file, station_file, template_dir, tbegin, tend,
    TDUR, duration, filt, freq0, dt, nattempts, waittime,
    type_threshold='MAD', threshold=0.0075):
    """
    Find LFEs with the temporary stations from FAME
    using the templates from Plourde et al. (2015)

    Input:
        type family_file = string
        family_file = File containing the list of LFE families
        type station_file = string
        station_file = File containing the list of stations
        type template_dir = string
        template_dir = Directory where to find the LFE templates
        type tbegin = tuple of 6 integers
        tbegin = Time when we begin looking for LFEs
        type tend = tuple of 6 integers
        tend = Time when we stop looking for LFEs
        type TDUR = float
        TDUR = Time to add before and after the time window for tapering
        type duration = float
        duration = Duration of the LFE templates
        type filt = tuple of floats
        filt = Lower and upper frequencies of the filter
        type freq0 = float
        freq0 = Maximum frequency rate of LFE occurrence
        type dt = float
        dt = Time step for the LFE templates
        type nattempts = integer
        nattempts = Number of times we try to download data
        type waittime = positive float
        waittime = Time to wait between two attempts at downloading
        type type_threshold = string
        type_threshold = 'MAD' or 'Threshold'
        type threshold = float
        threshold = Cross-correlation value must be higher than that
    Output:
        None
    """
    # Get the network, channels, and location of the stations
    staloc = pd.read_csv(station_file,
        sep=r'\s{1,}', header=None, engine='python')
    staloc.columns = ['station', 'network', 'channels', 'location',
        'server', 'latitude', 'longitude', 'time_on', 'time_off']

    # Begin and end time of analysis
    t1 = UTCDateTime(year=tbegin[0], month=tbegin[1],
        day=tbegin[2], hour=tbegin[3], minute=tbegin[4],
        second=tbegin[5])
    t2 = UTCDateTime(year=tend[0], month=tend[1],
        day=tend[2], hour=tend[3], minute=tend[4],
        second=tend[5])
    # Number of hours of data to analyze
    nhour = int(ceil((t2 - t1) / 3600.0))

    # Begin and end time of downloading
    Tstart = t1 - TDUR
    Tend = t2 + duration + TDUR

    # Temporary directory to store the data
    namedir = 'tmp'
    if not os.path.exists(namedir):
        os.makedirs(namedir)

    # Download the data from the stations
    for ir in range(0, len(staloc)):
        station = staloc['station'][ir]
        network = staloc['network'][ir]
        channels = staloc['channels'][ir]
        location = staloc['location'][ir]
        server = staloc['server'][ir]
        time_on = staloc['time_on'][ir]
        time_off = staloc['time_off'][ir]

        # File to write error messages
        namedir = 'error'
        if not os.path.exists(namedir):
            os.makedirs(namedir)
        errorfile = 'error/' + station + '.txt'

        # Check whether there are data for this period of time
        year_on = int(time_on[0:4])
        month_on = int(time_on[5:7])
        day_on = int(time_on[8:10])
        year_off = int(time_off[0:4])
        month_off = int(time_off[5:7])
        day_off = int(time_off[8:10])
        if ((Tstart > UTCDateTime(year=year_on, month=month_on, day=day_on))
                and (Tend < UTCDateTime(year=year_off, month=month_off,
                                        day=day_off))):
            # First case: we can get the data from IRIS
            if server == 'IRIS':
                (D, orientation) = get_from_IRIS(station, network, channels,
                    location, Tstart, Tend, filt, dt, nattempts, waittime,
                    errorfile, DATADIR)
            # Second case: we get the data from NCEDC
            elif server == 'NCEDC':
                (D, orientation) = get_from_NCEDC(station, network, channels,
                    location, Tstart, Tend, filt, dt, nattempts, waittime,
                    errorfile, DATADIR)
            else:
                raise ValueError(
                    'You can only download data from IRIS and NCEDC')
            # Store the data into temporary files
            if isinstance(D, obspy.core.stream.Stream):
                D.write('tmp/' + station + '.mseed', format='MSEED')
                namefile = 'tmp/' + station + '.pkl'
                with open(namefile, 'wb') as f:
                    pickle.dump(orientation, f)

    # Loop on families
    families = pd.read_csv(family_file,
        sep=r'\s{1,}', header=None, engine='python')
    families.columns = ['family', 'stations']
    for i in range(0, len(families)):
        # Create directory to store the LFE times
        namedir = 'LFEs/' + families['family'].iloc[i]
        if not os.path.exists(namedir):
            os.makedirs(namedir)

        # File to write error messages
        namedir = 'error'
        if not os.path.exists(namedir):
            os.makedirs(namedir)
        errorfile = 'error/' + families['family'].iloc[i] + '.txt'

        # Create dataframe to store LFE times
        df = pd.DataFrame(columns=['year', 'month', 'day', 'hour',
                                   'minute', 'second', 'cc', 'nchannel'])

        # Read the templates
        stations = families['stations'].iloc[i].split(',')
        templates = Stream()
        for station in stations:
            templatefile = template_dir + '/' + \
                families['family'].iloc[i] + '/' + station + '.pkl'
            with open(templatefile, 'rb') as f:
                data = pickle.load(f)
            if len(data) == 3:
                (EW, NS, UD) = data
                EW.stats.station = station
                NS.stats.station = station
                EW.stats.channel = 'E'
                NS.stats.channel = 'N'
                templates.append(EW)
                templates.append(NS)
            else:
                UD = data[0]
                UD.stats.station = station
                UD.stats.channel = 'Z'
                templates.append(UD)

        # Loop on hours of data
        for hour in range(0, nhour):
            nchannel = 0
            Tstart = t1 + hour * 3600.0
            Tend = t1 + (hour + 1) * 3600.0 + duration
            delta = Tend - Tstart
            ndata = int(delta / dt) + 1

            # Get the data
            data = []
            for station in stations:
                try:
                    D = read('tmp/' + station + '.mseed')
                    D = D.slice(Tstart, Tend)
                    namefile = 'tmp/' + station + '.pkl'
                    with open(namefile, 'rb') as f:
                        orientation = pickle.load(f)
                    # Get station metadata for reading response file
                    for ir in range(0, len(staloc)):
                        if station == staloc['station'][ir]:
                            network = staloc['network'][ir]
                            channels = staloc['channels'][ir]
                            location = staloc['location'][ir]
                            server = staloc['server'][ir]
                    # Orientation of template
                    # Date chosen: January 1st 2020
                    mychannels = channels.split(',')
                    mylocation = location
                    if mylocation == '--':
                        mylocation = ''
                    response = os.path.join(DATADIR, 'response',
                        network + '_' + station + '.xml')
                    inventory = read_inventory(response, format='STATIONXML')
                    reference = []
                    for channel in mychannels:
                        angle = inventory.get_orientation(
                            network + '.' + station + '.' + mylocation +
                            '.' + channel, UTCDateTime(2020, 1, 1, 0, 0, 0))
                        reference.append(angle)
                    # Append data to stream
                    if isinstance(D, obspy.core.stream.Stream):
                        stationdata = fill_data(D, orientation, station,
                                                channels, reference)
                        if len(stationdata) > 0:
                            for stream in stationdata:
                                data.append(stream)
                except Exception:
                    message = 'No data available for station {} '.format(
                        station) + 'at time {}/{}/{} - {}:{}:{}\n'.format(
                        Tstart.year, Tstart.month, Tstart.day, Tstart.hour,
                        Tstart.minute, Tstart.second)
                    # Record the failure (assumed intent: the original
                    # composed `message` but never used `errorfile`)
                    with open(errorfile, 'a') as f:
                        f.write(message)

            # Loop on channels
            for channel in range(0, len(data)):
                subdata = data[channel]
                # Check whether we have a complete one-hour-long recording
                if len(subdata) == 1:
                    if len(subdata[0].data) == ndata:
                        # Get the template
                        station = subdata[0].stats.station
                        component = subdata[0].stats.channel
                        template = templates.select(station=station,
                                                    component=component)[0]
                        # Cross correlation
                        cctemp = correlate.optimized(template, subdata[0])
                        if nchannel > 0:
                            cc = np.vstack((cc, cctemp))
                        else:
                            cc = cctemp
                        nchannel = nchannel + 1

            if nchannel > 0:
                # Compute average cross-correlation across channels
                meancc = np.mean(cc, axis=0)
                if type_threshold == 'MAD':
                    MAD = np.median(np.abs(meancc - np.mean(meancc)))
                    index = np.where(meancc >= threshold * MAD)
                elif type_threshold == 'Threshold':
                    index = np.where(meancc >= threshold)
                else:
                    raise ValueError(
                        'Type of threshold must be MAD or Threshold')
                times = np.arange(0.0, np.shape(meancc)[0] * dt, dt)
                # Get LFE times
                if np.shape(index)[1] > 0:
                    (time, cc) = clean_LFEs(index, times, meancc, dt, freq0)
                    # Add LFE times to dataframe
                    i0 = len(df.index)
                    for j in range(0, len(time)):
                        timeLFE = Tstart + time[j]
                        df.loc[i0 + j] = [int(timeLFE.year),
                            int(timeLFE.month), int(timeLFE.day),
                            int(timeLFE.hour), int(timeLFE.minute),
                            timeLFE.second +
                            timeLFE.microsecond / 1000000.0,
                            cc[j], nchannel]

        # Add to pandas dataframe and save
        df_all = df
        df_all = df_all.astype(dtype={'year': 'int32', 'month': 'int32',
            'day': 'int32', 'hour': 'int32', 'minute': 'int32',
            'second': 'float', 'cc': 'float', 'nchannel': 'int32'})
        df_all.to_csv('LFEs/' + families['family'].iloc[i] + '/catalog_' +
            '{:04d}{:02d}{:02d}_{:02d}{:02d}{:02d}'.format(tbegin[0],
            tbegin[1], tbegin[2], tbegin[3], tbegin[4], tbegin[5]) + '.csv')
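# A hypothetical driver call for the function above; every value here is
# illustrative (file names, time window, filter band), not taken from the
# source.
find_LFEs(family_file='families.txt',
          station_file='stations.txt',
          template_dir='templates',
          tbegin=(2020, 1, 1, 0, 0, 0),
          tend=(2020, 1, 2, 0, 0, 0),
          TDUR=10.0, duration=60.0, filt=(1.5, 9.0), freq0=1.0,
          dt=0.05, nattempts=10, waittime=10.0,
          type_threshold='MAD', threshold=8.0)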
class GetIIData(object):
    # initialize input vars
    def __init__(self, year, startday, network, **kwargs):
        # check for main args; set QUERY=True if present, else sys.exit(1)
        if (year != "") and (startday != "") and (network != ""):
            self.year = year
            self.startday = startday
            self.network = network
            QUERY = True
        else:
            QUERY = False

        # loop through **kwargs and initialize optional args
        self.endday = ""        # init endday string
        self.station = ""       # init station string
        self.location = ""      # init location string
        self.channel = ""       # init channel string
        self.debug = False      # init debug
        self.archive = False    # init archive
        for key, val in kwargs.items():
            if key == "endday":
                self.endday = val
            elif key == "station":
                self.station = val
            elif key == "location":
                self.location = val
            elif key == "channel":
                self.channel = val
            elif key == "debug":
                self.debug = self.toBool(val)
            elif key == "archive":
                self.archive = self.toBool(val)

        # print arguments if 'debug' mode
        if self.debug:
            print("Year: " + self.year)
            print("Start Day: " + self.startday)
            print("End Day: " + self.endday)
            print("Network: " + self.network)
            print("Station: " + self.station)
            print("Location: " + self.location)
            print("Channel: " + self.channel)

        # handle wildcards
        if self.location == "?":
            self.location = "*"
        if self.channel == "?":
            self.channel = "*"
        if self.station == "?":
            self.station = "*"

        # set start/end to UTCDateTime objects
        # --------------------------------------------------------------------
        self.startTime = UTCDateTime(year + startday + "T00:00:00.000")
        # If no end day given, default to 1 day
        if self.endday in ("", "?"):
            self.endday = str(int(self.startday) + 1).zfill(3)
            self.endTime = self.startTime + 24 * 60 * 60
        else:
            self.endTime = UTCDateTime(year + self.endday + "T00:00:00.000")
        print("Here is our start time: " +
              self.startTime.formatIRISWebService())
        print("Here is our end time: " +
              self.endTime.formatIRISWebService())
        self.days = int(self.endday) - int(self.startday)
        # there are 24 one-hour increments in a day
        self.hours = (int(self.endday) - int(self.startday)) * 24

        # Will only run if main args are given:
        # check QUERY flag, if True continue
        if QUERY:
            self.queryData()
        else:
            print('\nNo main args given.')
            print('Exiting\n')
            sys.exit(1)

    def queryData(self):
        # code from IRIS client
        # Here we pull the data
        client = Client("IRIS")
        DupStations = []
        DupLocations = []
        DupChannels = []
        self.st = Stream()
        self.STAWILD = False
        self.LOCWILD = False
        self.CHANWILD = False

        try:
            timeout = 300
            socket.setdefaulttimeout(timeout)
            # Query the data one hour at a time: a bulk request for the whole
            # window can be too large, and each request gets its own timeout.
            for hourIndex in range(0, self.hours):
                self.startTime1 = self.startTime + hourIndex * 60 * 60
                self.endTime1 = self.startTime + (hourIndex + 1) * 60 * 60
                requestArray = [(self.network, self.station, self.location,
                                 self.channel, self.startTime1,
                                 self.endTime1)]
                self.st1 = client.get_waveforms_bulk(requestArray)
                self.st += self.st1
                print(self.st)
                print()
            # self.st = client.get_waveforms_bulk(timeout=10,requestArray)
            for self.tr in self.st:
                # Here we remove the M data quality and go with D
                self.tr.stats.mseed['dataquality'] = 'D'
                if self.debug:
                    if self.station == '*':
                        self.STAWILD = True
                        DupStations.append(self.tr.stats.station)
                    else:
                        self.STAWILD = False
                    if self.location == '*':
                        self.LOCWILD = True
                        DupLocations.append(self.tr.stats.location)
                    else:
                        self.LOCWILD = False
                    if self.channel == '*':
                        self.CHANWILD = True
                        DupChannels.append(self.tr.stats.channel)
                    else:
                        self.CHANWILD = False
        # except TimeoutError:
        #     print('Get waveform timeout, exiting...')
        #     sys.exit(0)
        except Exception:
            print('Trouble getting data')
            sys.exit(0)

        # Take duplicate stations out of the list and turn station, location,
        # and channel into arrays for looping
        self.stations = list(set(DupStations))
        if self.station != '*':
            self.stations.append(self.station)
        self.locations = list(set(DupLocations))
        if self.location != '*':
            self.locations.append(self.location)
        self.channels = list(set(DupChannels))
        if self.channel != '*':
            self.channels.append(self.channel)
        print()
        print("Station(s) being pulled: " + str(self.stations))
        print("Location(s) being pulled: " + str(self.locations))
        print("Channel(s) being pulled: " + str(self.channels))

        # Now call code to store streams in mseed files
        self.storeMSEED()

    def storeMSEED(self):
        # Main program: code for storing MSEED files
        codepath = '/home/mkline/dev/getIIdataBackup/TEST_ARCHIVE/'
        self.stFinal = Stream()
        for self.channel in self.channels:
            self.trace2 = self.st.select(channel=self.channel)
            for self.location in self.locations:
                self.trace1 = self.trace2.select(location=self.location)
                for self.station in self.stations:
                    print()
                    print("For station, location, and channel: "
                          + self.station + " " + self.location + " "
                          + self.channel)
                    trace = self.trace1.select(station=self.station)
                    trace.merge()
                    trace.sort()
                    trace.count()
                    for dayIndex in range(0, self.days):
                        print("Day properties: ")
                        # startTime works better than trace[0].stats.starttime
                        trimStart = self.startTime + dayIndex * 24 * 60 * 60
                        trimEnd = self.startTime + \
                            (dayIndex + 1) * 24 * 60 * 60
                        print("Start of day: " + str(trimStart))
                        print("End of day: " + str(trimEnd))
                        # Convert date into julian day to store in directory
                        timesplit = re.split('T', str(trimStart))
                        s = timesplit[0]
                        fmt = '%Y-%m-%d'
                        dt = datetime.datetime.strptime(s, fmt)
                        tt = dt.timetuple()
                        NewStartDay = str(tt.tm_yday).zfill(3)
                        self.stFinal = trace.copy()
                        self.stFinal.trim(starttime=trimStart,
                                          endtime=trimEnd)
                        # Make sure traces with no data don't get added to
                        # the directory structure
                        if not self.stFinal or \
                                str(self.stFinal[0].max()) == '--':
                            print("No trace for given day")
                        else:
                            # Create the directory structure here, so that
                            # directories are only added when they are used
                            self.stFinal = self.stFinal.split()
                            if not os.path.exists(codepath + self.network +
                                    '_' + self.station + '/'):
                                os.mkdir(codepath + self.network + '_' +
                                         self.station + '/')
                            if not os.path.exists(codepath + self.network +
                                    '_' + self.station + '/' +
                                    self.year + '/'):
                                os.mkdir(codepath + self.network + '_' +
                                         self.station + '/' +
                                         self.year + '/')
                            stpath = codepath + self.network + '_' + \
                                self.station + '/' + self.year + '/' + \
                                self.year + '_' + NewStartDay + '/'
                            if not os.path.exists(stpath):
                                os.mkdir(stpath)
                            # Here we write the data using STEIM 2 and
                            # 512-byte record lengths
                            self.stFinal.write(stpath +
                                self.stFinal[0].stats.location + '_' +
                                self.stFinal[0].stats.channel + '.512.seed',
                                format='MSEED', reclen=512,
                                encoding='STEIM2')
                            print(self.stFinal)

    # convert optional boolean strings to boolean vars
    def toBool(self, value):
        """
        Converts 'string' to boolean. Raises exception for invalid formats
            True values: 1, True, true, "1", "True", "true", "yes", "y", "t"
            False values: 0, False, false, "0", "False", "false", "no", "n", "f"
        """
        if str(value).lower() in ("true", "yes", "t", "y", "1"):
            return True
        if str(value).lower() in ("false", "no", "f", "n", "0"):
            return False
        raise ValueError('Invalid value for boolean conversion: ' +
                         str(value))
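# A hypothetical invocation of the class above (all values are illustrative).
# Construction triggers the whole pipeline: __init__ validates the arguments,
# queryData pulls the waveforms hour by hour, and storeMSEED archives them.
GetIIData("2014", "001", "II", station="PFO", location="?",
          channel="?", endday="003", debug="true")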
def find_LFEs(filename, stations, tbegin, tend, TDUR, filt,
    freq0, nattempts, waittime, draw=False, type_threshold='MAD',
    threshold=0.0075):
    """
    Find LFEs with the temporary stations from FAME
    using the templates from Plourde et al. (2015)

    Input:
        type filename = string
        filename = Name of the template
        type stations = list of strings
        stations = Name of the stations used for the matched-filter algorithm
        type tbegin = tuple of 6 integers
        tbegin = Time when we begin looking for LFEs
        type tend = tuple of 6 integers
        tend = Time when we stop looking for LFEs
        type TDUR = float
        TDUR = Time to add before and after the time window for tapering
        type filt = tuple of floats
        filt = Lower and upper frequencies of the filter
        type freq0 = float
        freq0 = Maximum frequency rate of LFE occurrence
        type nattempts = integer
        nattempts = Number of times we try to download data
        type waittime = positive float
        waittime = Time to wait between two attempts at downloading
        type draw = boolean
        draw = Do we draw a figure of the cross-correlation?
        type type_threshold = string
        type_threshold = 'MAD' or 'Threshold'
        type threshold = float
        threshold = Cross-correlation value must be higher than that
    Output:
        None
    """
    # Get the network, channels, and location of the stations
    staloc = pd.read_csv('../data/Ducellier/stations_permanent.txt',
        sep=r'\s{1,}', header=None, engine='python')
    staloc.columns = ['station', 'network', 'channels', 'location',
        'server', 'latitude', 'longitude', 'time_on', 'time_off']

    # Create directory to store the LFE times
    namedir = 'LFEs/' + filename
    if not os.path.exists(namedir):
        os.makedirs(namedir)

    # File to write error messages
    namedir = 'error'
    if not os.path.exists(namedir):
        os.makedirs(namedir)
    errorfile = 'error/' + filename + '.txt'

    # Read the templates
    templates = Stream()
    for station in stations:
        templatefile = 'templates_new/' + filename + '/' + station + '.pkl'
        with open(templatefile, 'rb') as f:
            data = pickle.load(f)
        if len(data) == 3:
            (EW, NS, UD) = data
            EW.stats.station = station
            NS.stats.station = station
            EW.stats.channel = 'E'
            NS.stats.channel = 'N'
            templates.append(EW)
            templates.append(NS)
        else:
            UD = data[0]
            UD.stats.station = station
            UD.stats.channel = 'Z'
            templates.append(UD)

    # Begin and end time of analysis
    t1 = UTCDateTime(year=tbegin[0], month=tbegin[1],
        day=tbegin[2], hour=tbegin[3], minute=tbegin[4],
        second=tbegin[5])
    t2 = UTCDateTime(year=tend[0], month=tend[1],
        day=tend[2], hour=tend[3], minute=tend[4],
        second=tend[5])

    # Read the data
    data = []
    for station in stations:
        # Get station metadata for downloading
        for ir in range(0, len(staloc)):
            if station == staloc['station'][ir]:
                network = staloc['network'][ir]
                channels = staloc['channels'][ir]
                location = staloc['location'][ir]
                server = staloc['server'][ir]
        # Duration of template
        template = templates.select(station=station, component='Z')[0]
        dt = template.stats.delta
        nt = template.stats.npts
        duration = (nt - 1) * dt
        Tstart = t1 - TDUR
        Tend = t2 + duration + TDUR
        delta = t2 + duration - t1
        ndata = int(delta / dt) + 1
        # Orientation of template
        # Date chosen: January 1st 2012
        mychannels = channels.split(',')
        mylocation = location
        if mylocation == '--':
            mylocation = ''
        response = '../data/response/' + network + '_' + station + '.xml'
        inventory = read_inventory(response, format='STATIONXML')
        reference = []
        for channel in mychannels:
            angle = inventory.get_orientation(
                network + '.' + station + '.' + mylocation + '.' + channel,
                UTCDateTime(2012, 1, 1, 0, 0, 0))
            reference.append(angle)
        # First case: we can get the data from IRIS
        if server == 'IRIS':
            (D, orientation) = get_from_IRIS(station, network, channels,
                location, Tstart, Tend, filt, dt, nattempts, waittime,
                errorfile)
        # Second case: we get the data from NCEDC
        elif server == 'NCEDC':
            (D, orientation) = get_from_NCEDC(station, network, channels,
                location, Tstart, Tend, filt, dt, nattempts, waittime,
                errorfile)
        else:
            raise ValueError('You can only download data from IRIS and NCEDC')
        # Append data to stream
        if isinstance(D, obspy.core.stream.Stream):
            stationdata = fill_data(D, orientation, station, channels,
                                    reference)
            if len(stationdata) > 0:
                for stream in stationdata:
                    data.append(stream)

    # Number of hours of data to analyze
    nhour = int(ceil((t2 - t1) / 3600.0))

    # Create dataframe to store LFE times
    df = pd.DataFrame(columns=['year', 'month', 'day', 'hour',
                               'minute', 'second', 'cc', 'nchannel'])

    # Loop on hours of data
    for hour in range(0, nhour):
        nchannel = 0
        Tstart = t1 + hour * 3600.0
        Tend = t1 + (hour + 1) * 3600.0 + duration
        delta = Tend - Tstart
        ndata = int(delta / dt) + 1

        # Loop on channels
        for channel in range(0, len(data)):
            # Cut the data
            subdata = data[channel]
            subdata = subdata.slice(Tstart, Tend)
            # Check whether we have a complete one-hour-long recording
            if len(subdata) == 1:
                if len(subdata[0].data) == ndata:
                    # Get the template
                    station = subdata[0].stats.station
                    component = subdata[0].stats.channel
                    template = templates.select(station=station,
                                                component=component)[0]
                    # Cross correlation
                    cctemp = correlate.optimized(template, subdata[0])
                    if nchannel > 0:
                        cc = np.vstack((cc, cctemp))
                    else:
                        cc = cctemp
                    nchannel = nchannel + 1

        if nchannel > 0:
            # Compute average cross-correlation across channels
            meancc = np.mean(cc, axis=0)
            if type_threshold == 'MAD':
                MAD = np.median(np.abs(meancc - np.mean(meancc)))
                index = np.where(meancc >= threshold * MAD)
            elif type_threshold == 'Threshold':
                index = np.where(meancc >= threshold)
            else:
                raise ValueError('Type of threshold must be MAD or Threshold')
            times = np.arange(0.0, np.shape(meancc)[0] * dt, dt)

            # Get LFE times
            if np.shape(index)[1] > 0:
                (time, cc) = clean_LFEs(index, times, meancc, dt, freq0)
                # Add LFE times to dataframe
                i0 = len(df.index)
                for i in range(0, len(time)):
                    timeLFE = Tstart + time[i]
                    df.loc[i0 + i] = [int(timeLFE.year), int(timeLFE.month),
                        int(timeLFE.day), int(timeLFE.hour),
                        int(timeLFE.minute), timeLFE.second +
                        timeLFE.microsecond / 1000000.0, cc[i], nchannel]

            # Draw figure
            if draw:
                params = {'xtick.labelsize': 16, 'ytick.labelsize': 16}
                pylab.rcParams.update(params)
                plt.figure(1, figsize=(20, 8))
                if np.shape(index)[1] > 0:
                    for i in range(0, len(time)):
                        plt.axvline(time[i], linewidth=2, color='grey')
                plt.plot(np.arange(0.0, np.shape(meancc)[0] * dt, dt),
                         meancc, color='black')
                if type_threshold == 'MAD':
                    plt.axhline(threshold * MAD, linewidth=2, color='red',
                        label='{:6.2f} * MAD'.format(threshold))
                elif type_threshold == 'Threshold':
                    plt.axhline(threshold, linewidth=2, color='red',
                        label='Threshold = {:8.4f}'.format(threshold))
                else:
                    raise ValueError(
                        'Type of threshold must be MAD or Threshold')
                plt.xlim(0.0, (np.shape(meancc)[0] - 1) * dt)
                plt.xlabel('Time (s)', fontsize=24)
                plt.ylabel('Cross-correlation', fontsize=24)
                plt.title('Average cross-correlation across stations',
                          fontsize=30)
                plt.legend(loc=2, fontsize=24)
                plt.savefig('LFEs/' + filename + '/' +
                    '{:04d}{:02d}{:02d}_{:02d}{:02d}{:02d}'.format(
                    Tstart.year, Tstart.month, Tstart.day, Tstart.hour,
                    Tstart.minute, Tstart.second) + '.png', format='png')
                plt.close(1)

    # Add to pandas dataframe and save
    namefile = 'LFEs/' + filename + '/catalog.pkl'
    if os.path.exists(namefile):
        df_all = pickle.load(open(namefile, 'rb'))
        df_all = pd.concat([df_all, df], ignore_index=True)
    else:
        df_all = df
    df_all = df_all.astype(dtype={'year': 'int32', 'month': 'int32',
        'day': 'int32', 'hour': 'int32', 'minute': 'int32',
        'second': 'float', 'cc': 'float', 'nchannel': 'int32'})
    pickle.dump(df_all, open(namefile, 'wb'))
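# A minimal sketch of reading back the catalog written above and assembling
# a single datetime column; the family name in the path is illustrative.
import pickle
import pandas as pd

with open('LFEs/family_name/catalog.pkl', 'rb') as f:
    catalog = pickle.load(f)
cols = ['year', 'month', 'day', 'hour', 'minute', 'second']
# astype(int) drops fractional seconds, which is fine for a quick look
catalog['date'] = pd.to_datetime(catalog[cols].astype(int))
print(catalog.sort_values('cc', ascending=False).head())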
def analyze_data(families, staloc, tbegin, tend,
    freq0, type_threshold, threshold, ncpu, icpu):
    """
    Scan the time window from tbegin to tend for the subset of LFE families
    assigned to CPU icpu (out of ncpu) and append the detections to each
    family's catalog. (template_dir is a module-level setting in this
    script.)
    """
    nfamilies = int(ceil(len(families) / ncpu))
    ibegin = icpu * nfamilies
    iend = min((icpu + 1) * nfamilies, len(families))

    # Loop on families
    for i in range(ibegin, iend):
        # Create directory to store the LFE times
        namedir = 'LFEs/' + families['family'].iloc[i]
        if not os.path.exists(namedir):
            os.makedirs(namedir)

        # File to write number of stations
        namedir = 'nstations'
        if not os.path.exists(namedir):
            os.makedirs(namedir)
        stationfile = 'nstations/' + families['family'].iloc[i] + '.txt'

        # Create dataframe to store LFE times
        df = pd.DataFrame(columns=['year', 'month', 'day', 'hour',
                                   'minute', 'second', 'cc', 'nchannel'])

        # Read the templates
        stations = families['stations'].iloc[i].split(',')
        templates = Stream()
        orientations = []
        names = []
        for station in stations:
            subset = staloc.loc[staloc['station'] == station]
            channels = subset['channels'].iloc[0]
            mychannels = channels.split(',')
            for channel in mychannels:
                templatefile = template_dir + '/' + \
                    families['family'].iloc[i] + '/' + station + '_' + \
                    channel + '.pkl'
                with open(templatefile, 'rb') as f:
                    data = pickle.load(f)
                template = data[0]
                angle = data[1]
                templates.append(template)
                orientations.append(angle)
                names.append(station + '_' + channel)

        # Check the time step of the stations
        subset = staloc.loc[staloc['station'].isin(stations)]
        if len(subset['dt'].value_counts()) == 1:
            dt = subset['dt'].iloc[0]
        else:
            raise ValueError('All stations must have the same time step')

        # Number of hours of data to analyze
        t1 = UTCDateTime(year=tbegin[0], month=tbegin[1],
            day=tbegin[2], hour=tbegin[3], minute=tbegin[4],
            second=tbegin[5])
        t2 = UTCDateTime(year=tend[0], month=tend[1],
            day=tend[2], hour=tend[3], minute=tend[4],
            second=tend[5])
        nhour = int(ceil((t2 - t1) / 3600.0))
        duration = families['duration'].iloc[i]

        # To rotate components
        swap = {'E': 'N', 'N': 'E', '1': '2', '2': '1'}

        # Loop on hours of data
        for hour in range(0, nhour):
            Tstart = t1 + hour * 3600.0
            Tend = t1 + (hour + 1) * 3600.0 + duration
            delta = Tend - Tstart
            ndata = int(delta / dt) + 1

            # Get the data
            data = []
            for station in stations:
                subset = staloc.loc[staloc['station'] == station]
                channels = subset['channels'].iloc[0]
                mychannels = channels.split(',')
                for num, channel in enumerate(mychannels):
                    try:
                        D = read('tmp/' + station + '_' + channel + '.mseed')
                        D = D.slice(Tstart, Tend)
                        if isinstance(D, obspy.core.stream.Stream):
                            namefile = 'tmp/' + station + '_' + channel + \
                                '.pkl'
                            with open(namefile, 'rb') as f:
                                orientation = pickle.load(f)[num]
                            index = names.index(station + '_' + channel)
                            reference = orientations[index]
                            # Rotate components
                            if (len(mychannels) > 1) and (num < 2):
                                if orientation != reference:
                                    channel_new = channel[0:2] + \
                                        swap[channel[2]]
                                    D_new = read('tmp/' + station + '_' +
                                        channel_new + '.mseed')
                                    D_new = D_new.slice(Tstart, Tend)
                                    namefile = 'tmp/' + station + '_' + \
                                        channel_new + '.pkl'
                                    with open(namefile, 'rb') as f:
                                        if num == 0:
                                            orientation_new = \
                                                pickle.load(f)[1]
                                        else:
                                            orientation_new = \
                                                pickle.load(f)[0]
                                    index = names.index(station + '_' +
                                        channel_new)
                                    reference_new = orientations[index]
                                    if channel[2] in ['E', '1']:
                                        D = rotate_data(D, D_new,
                                            orientation, orientation_new,
                                            reference, reference_new, 'E')
                                    else:
                                        D = rotate_data(D_new, D,
                                            orientation_new, orientation,
                                            reference_new, reference, 'N')
                            # Append stream to data
                            data.append(D)
                    except Exception:
                        # (message is composed but not written in this
                        # version of the script)
                        message = 'No data available for station {}'.format(
                            station) + ' and channel {}'.format(channel) + \
                            ' at time {}/{}/{} - {}:{}:{}\n'.format(
                            Tstart.year, Tstart.month, Tstart.day,
                            Tstart.hour, Tstart.minute, Tstart.second)

            # Loop on channels
            nchannel = 0
            for j in range(0, len(data)):
                subdata = data[j]
                # Check whether we have a complete one-hour-long recording
                if len(subdata) == 1:
                    if len(subdata[0].data) == ndata:
                        # Get the template
                        station = subdata[0].stats.station
                        channel = subdata[0].stats.channel
                        template = templates.select(station=station,
                                                    channel=channel)[0]
                        # Cross correlation
                        cctemp = correlate.optimized(template, subdata[0])
                        if nchannel > 0:
                            cc = np.vstack((cc, cctemp))
                        else:
                            cc = cctemp
                        nchannel = nchannel + 1

            # Write number of channels
            with open(stationfile, 'a') as file:
                file.write('{} {} {} {} {}\n'.format(Tstart.year,
                    Tstart.month, Tstart.day, Tstart.hour, nchannel))

            if nchannel > 0:
                # Compute average cross-correlation across channels
                if len(np.shape(cc)) == 1:
                    meancc = cc
                else:
                    meancc = np.mean(cc, axis=0)
                if type_threshold == 'MAD':
                    MAD = np.median(np.abs(meancc - np.mean(meancc)))
                    index = np.where(meancc >= threshold * MAD)
                elif type_threshold == 'Threshold':
                    index = np.where(meancc >= threshold)
                else:
                    raise ValueError(
                        'Type of threshold must be MAD or Threshold')
                times = np.arange(0.0, np.shape(meancc)[0] * dt, dt)
                # Get LFE times
                if np.shape(index)[1] > 0:
                    (time, cc) = clean_LFEs(index, times, meancc, dt, freq0)
                    # Add LFE times to dataframe
                    i0 = len(df.index)
                    for j in range(0, len(time)):
                        timeLFE = Tstart + time[j]
                        df.loc[i0 + j] = [int(timeLFE.year),
                            int(timeLFE.month), int(timeLFE.day),
                            int(timeLFE.hour), int(timeLFE.minute),
                            timeLFE.second +
                            timeLFE.microsecond / 1000000.0,
                            cc[j], nchannel]

        # Add to pandas dataframe and save
        namefile = 'LFEs/' + families['family'].iloc[i] + '/catalog.pkl'
        if os.path.exists(namefile):
            df_all = pickle.load(open(namefile, 'rb'))
            df_all = pd.concat([df_all, df], ignore_index=True)
        else:
            df_all = df
        df_all = df_all.astype(dtype={'year': 'int32', 'month': 'int32',
            'day': 'int32', 'hour': 'int32', 'minute': 'int32',
            'second': 'float', 'cc': 'float', 'nchannel': 'int32'})
        pickle.dump(df_all, open(namefile, 'wb'))
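# `rotate_data` is called above but not defined in this section. A rough
# sketch of a plausible implementation, assuming the orientation/reference
# values carry an azimuth in degrees (obspy's get_orientation returns a dict
# with an 'azimuth' key) and that each input stream holds a single trace;
# none of this is confirmed by the source.
import numpy as np


def rotate_data(D1, D2, orientation1, orientation2,
                reference1, reference2, component):
    """Rotate the horizontal pair (D1, D2) from its measured azimuths to the
    template's reference azimuths and return the requested component."""
    phi = np.radians(reference1['azimuth'] - orientation1['azimuth'])
    d1 = D1[0].data
    d2 = D2[0].data
    rot_e = np.cos(phi) * d1 + np.sin(phi) * d2
    rot_n = -np.sin(phi) * d1 + np.cos(phi) * d2
    out = D1.copy() if component == 'E' else D2.copy()
    out[0].data = rot_e if component == 'E' else rot_n
    return out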