def simple_edss(edss):
    ''' Use only a few columns so that we don't make 21*20 coherence pairs '''
    all_channels = util_funcs.get_common_channel_names()
    subset_channels = [all_channels.index(channel) for channel in complex_feature_channels]
    return [(datum[0][:, subset_channels], datum[1]) for datum in edss]
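
# A minimal sketch (not part of the pipeline) of the arithmetic behind the docstring above:
# with all 21 common channels, ordered pair-wise coherence produces 21 * 20 = 420 pairs,
# while a small hand-picked subset shrinks the feature count quadratically. The 4-channel
# subset size below is an illustrative assumption, not the project's actual config.
def _coherence_pair_count(num_channels):
    # ordered channel pairs, excluding self-coherence
    return num_channels * (num_channels - 1)

assert _coherence_pair_count(21) == 420
assert _coherence_pair_count(4) == 12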
def use_all_channels_for_coherence_detect_knn():
    complex_feature_channels = util_funcs.get_common_channel_names()  # returns all the channels
    train_pkl = "/home/msaqib/trainSeizureData_expanded.pkl"
    valid_pkl = "/home/msaqib/validSeizureData_expanded.pkl"
    test_pkl = "/home/msaqib/testSeizureData_expanded.pkl"
    random_under_sample_data_gen = True
    use_simple_hand_engineered_features = False
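
# Hedged usage sketch: the *_pkl paths above point at pre-expanded seizure data splits on
# the cluster filesystem. Assuming they were written with pickle.dump, loading one split
# would look like this; the helper name is illustrative.
import pickle

def _load_split(pkl_path):
    with open(pkl_path, "rb") as f:
        return pickle.load(f)

# train_data = _load_split("/home/msaqib/trainSeizureData_expanded.pkl")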
def use_lstm():
    discretize_age = False
    output_size = 1
    use_simple_lstm = True
    kbins_encoding = "onehot-dense"
    window = 5
    early_stopping = True
    patience = 10
    # variable batch, variable time steps, but constant num features
    input_shape = (None, None, (len(read.EdfFFTDatasetTransformer.freq_bins) - 1) * len(util_funcs.get_common_channel_names()))
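
# Hedged sketch of a model consistent with the input_shape above: a Keras LSTM that accepts
# a variable batch size and a variable number of time steps but a fixed feature dimension,
# trained with early stopping at patience=10. This assumes Keras is the modeling framework;
# the layer width and optimizer are illustrative, not the project's actual architecture.
from tensorflow.keras import callbacks, layers, models

def _build_simple_lstm(n_features, output_size=1):
    model = models.Sequential([
        layers.Input(shape=(None, n_features)),  # (time steps, features); batch dim is implicit
        layers.LSTM(64),
        layers.Dense(output_size, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model

early_stop = callbacks.EarlyStopping(patience=10, restore_best_weights=True)
# model = _build_simple_lstm(n_features=...); model.fit(x, y, callbacks=[early_stop])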
def run_prep(file_name, annotation, split="train"):
    data = mne.io.read_raw_edf(file_name, preload=True)
    data = data.pick_channels(util_funcs.get_common_channel_names())  # use the 21 channels guaranteed in each sample
    data = data.reorder_channels(util_funcs.get_common_channel_names())
    data.rename_channels(constants.MNE_CHANNEL_EDF_MAPPING)
    data.resample(512)  # upsample to highest frequency, as per best practice
    data.set_eeg_reference()
    data.set_montage("standard_1020")
    data.filter(1, 50)
    montage_kind = "standard_1020"
    maxTime = annotation.index.max() / pd.Timedelta(seconds=1)
    montage = mne.channels.make_standard_montage(montage_kind)
    ref, patient, session, token = read.parse_edf_token_path_structure(file_name)
    # for i in range(int(maxTime/2)):
    basePath = f"/n/scratch2/ms994/medium_size/{split}/{patient}/{session}/{token}/"
    Path(basePath).mkdir(parents=True, exist_ok=True)
    shutil.copyfile(file_name[:-4] + ".tse", f"{basePath}label.tse")
    shutil.copyfile(file_name[:-4] + ".lbl", f"{basePath}montage.lbl")
    shutil.copyfile(file_name[:-9] + ".txt", f"{basePath}notes.txt")
    dataDict = Dict()
    for i in range(int(maxTime / 2) - 1):
        croppedData = data.copy().crop(i * 2, i * 2 + 4)
        croppedData.resample(constants.COMMON_FREQ)  # resample to minimum
        dataDict[i].index = i
        dataDict[i].data = croppedData
        dataDict[i].start = i * 2
        dataDict[i].end = i * 2 + 4
        if (i % 500 == 499):
            # save up to 500 separate data segments at a time to avoid an IO bottleneck in scratch2,
            # but also to avoid creating any single pickle that is too big to load
            pickle.dump(dataDict, open(basePath + f"intermediate_{int(np.ceil(i/500))}", "wb"))
            dataDict = Dict()
    pickle.dump(dataDict, open(basePath + f"intermediate_{int(np.ceil(i/500))}", "wb"))
    print(f"COMPLETED {file_name}")
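
# Hedged sketch of reading back one of the intermediate pickles written by run_prep above.
# Each file appears to hold an attribute-style Dict (assumed to be addict.Dict) keyed by
# segment index, where every entry carries the cropped mne Raw object plus its start/end
# times in seconds. The base path in the usage comment is a placeholder, not a real token.
import pickle

def _load_intermediate(base_path, chunk=1):
    with open(f"{base_path}intermediate_{chunk}", "rb") as f:
        return pickle.load(f)  # Dict of {segment index: {index, data, start, end}}

# segments = _load_intermediate("/n/scratch2/ms994/medium_size/train/<patient>/<session>/<token>/")
# segments[0].data   -> 4-second mne Raw crop; segments[0].start -> offset in seconds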
def __init__(
        self,
        data_split,
        ref,
        num_files=None,
        resample=pd.Timedelta(seconds=constants.COMMON_DELTA),
        start_offset=pd.Timedelta(seconds=0),  # start at 0 unless we want something different
        max_length=None,
        expand_tse=False,  # save memory; don't try to build the time-by-annotation df
        dtype=np.float32,
        n_process=None,
        use_average_ref_names=True,
        filter=True,
        lp_cutoff=1,
        hp_cutoff=50,  # get close to Nyquist without actually hitting it
        order_filt=5,
        columns_to_use=util_funcs.get_common_channel_names(),
        use_numpy=False,
        specific_seiz_types=None):
    self.data_split = data_split
    if n_process is None:
        n_process = mp.cpu_count()
    self.n_process = n_process
    self.ref = ref
    self.resample = resample
    self.dtype = dtype
    self.start_offset = start_offset
    self.max_length = max_length
    self.manager = mp.Manager()
    self.edf_tokens = get_all_token_file_names(data_split, ref)
    self.specific_seiz_types = specific_seiz_types
    if self.specific_seiz_types is not None:
        util_funcs.g
    self.expand_tse = expand_tse
    self.use_average_ref_names = use_average_ref_names
    if num_files is not None:
        self.edf_tokens = self.edf_tokens[0:num_files]
    self.filter = filter
    self.hp_cutoff = hp_cutoff
    self.lp_cutoff = lp_cutoff
    self.order_filt = order_filt
    self.columns_to_use = columns_to_use
    self.use_numpy = use_numpy
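
# Hedged usage sketch for the constructor above. Only __init__ is shown in this excerpt, so
# the class name (EdfDataset) and the "01_tcp_ar" TUH reference string are assumptions; the
# keyword arguments simply mirror the signature above.
# ds = EdfDataset(
#     data_split="train",
#     ref="01_tcp_ar",
#     num_files=10,                        # only keep the first 10 edf tokens
#     max_length=pd.Timedelta(seconds=4),
#     filter=True, lp_cutoff=1, hp_cutoff=50,
# )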
def __init__(self,
             edfRawData,
             n_process=None,
             coherence_all=True,
             coherence_pairs=None,
             average_coherence=True,
             coherence_bin=None,
             columns_to_use=util_funcs.get_common_channel_names(),
             is_pandas=True,
             is_tuple_data=True):
    """
    Parameters
    ----------
    edfRawData : DataFrame
        An array-like holding the data for coherence
    n_process : int
        number of processes to use when indexing a slice
    coherence_all : bool
        If True, do pair-wise coherence on all channels; this increases the number of features to n*(n-1)
    coherence_pairs : list
        If coherence_all is False, pass in a list of tuples holding columns to run coherence measurements on
    average_coherence : bool
        If True, just take the average of the coherence over all represented frequencies.
        If False, use coherence_bin to histogram-bin everything

    Returns
    -------
    CoherenceTransformer
        Array-like
    """
    self.edfRawData = edfRawData
    self.n_process = n_process
    self.is_pandas = is_pandas
    self.coherence_all = coherence_all
    self.coherence_pairs = coherence_pairs
    self.average_coherence = average_coherence
    self.coherence_bin = coherence_bin
    self.columns_to_use = columns_to_use
    self.is_tuple_data = is_tuple_data
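
# Hedged sketch of the per-pair computation the options above describe: for one pair of
# channels, scipy.signal.coherence returns a coherence value per frequency; with
# average_coherence=True a single mean value per pair is kept, otherwise the per-frequency
# values could be histogram-binned via coherence_bin. This is not the class's actual
# implementation, and the sampling frequency below is an illustrative assumption.
import numpy as np
from scipy.signal import coherence

def _pair_coherence(x, y, fs=256, average=True):
    freqs, cxy = coherence(x, y, fs=fs)
    if average:
        return cxy.mean()      # one scalar feature per channel pair
    return freqs, cxy          # leave the binning to the caller

rng = np.random.default_rng(0)
a, b = rng.standard_normal(1024), rng.standard_normal(1024)
print(_pair_coherence(a, b))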
def get_random_channel_ordering():
    channel_ordering = [i for i in range(len(util_funcs.get_common_channel_names()))]
    np.random.shuffle(channel_ordering)
    return channel_ordering
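
# Hedged usage sketch: the permutation returned above can be used to shuffle the channel
# axis of a (time, channels) array, e.g. as a channel-ordering augmentation. The array
# shape here is illustrative.
import numpy as np

ordering = get_random_channel_ordering()
eeg = np.zeros((1024, len(ordering)))   # (time steps, channels)
shuffled = eeg[:, ordering]             # columns reordered according to the permutation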
def use_all_columns():
    columns_to_use = util_funcs.get_common_channel_names()
def __init__(
        self,
        data_split,
        ref,
        num_files=None,
        resample=pd.Timedelta(seconds=constants.COMMON_DELTA),
        max_length=pd.Timedelta(seconds=4),
        expand_tse=False,  # save memory; don't try to build the time-by-annotation df
        dtype=np.float32,
        n_process=None,
        use_average_ref_names=True,
        filter=True,
        lp_cutoff=1,
        hp_cutoff=50,  # get close to Nyquist without actually hitting it, to avoid errors
        order_filt=5,
        columns_to_use=util_funcs.get_common_channel_names(),
        use_numpy=True,
        ensemble_mode=RANDOM_SAMPLE_ENSEMBLE,
        max_num_samples=20,
        file_lengths=None,  # automatically populated if not given
        edf_tokens=None,
        labels=None,  # labels that map to edf token level
        generate_sample_info=True
):
    if labels is not None:
        assert len(labels) == len(edf_tokens)
    self.data_split = data_split
    if n_process is None:
        n_process = mp.cpu_count()
    self.n_process = n_process
    self.ref = ref
    self.resample = resample
    self.dtype = dtype
    if (type(max_length) == int):
        # an int max_length is interpreted as a number of COMMON_DELTA-long samples
        max_length = max_length * pd.Timedelta(seconds=constants.COMMON_DELTA)
    self.max_length = max_length
    self.manager = mp.Manager()
    if edf_tokens is None:
        self.edf_tokens = read.get_all_token_file_names(data_split, ref)
    else:
        self.edf_tokens = edf_tokens
    self.expand_tse = expand_tse
    self.use_average_ref_names = use_average_ref_names
    if num_files is not None:
        self.edf_tokens = self.edf_tokens[0:num_files]
    self.filter = filter
    self.hp_cutoff = hp_cutoff
    self.lp_cutoff = lp_cutoff
    self.order_filt = order_filt
    self.columns_to_use = columns_to_use
    self.use_numpy = use_numpy
    self.ensemble_mode = ensemble_mode
    self.max_num_samples = max_num_samples
    if file_lengths is None:
        file_lengths = util_funcs.get_file_sizes(data_split, ref)
    self.file_lengths = file_lengths
    self.labels = labels
    self.sampleInfo = Dict()
    if generate_sample_info:
        self.generateSampleInfo()
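
# Hedged sketch of the integer max_length conversion above: an int is treated as a count of
# COMMON_DELTA-long samples and converted into a pd.Timedelta. The 1/256 s value used for
# COMMON_DELTA here is an illustrative assumption.
import pandas as pd

COMMON_DELTA = 1 / 256                                    # assumed sample spacing in seconds
max_length = 500                                          # 500 samples...
max_length = max_length * pd.Timedelta(seconds=COMMON_DELTA)
print(max_length)                                         # ...become a ~1.95 s Timedelta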
def __init__(
        self,
        segment_file_tuples,
        columns_to_use=util_funcs.get_common_channel_names(),
        use_numpy=True,
        lp_cutoff=1,
        hp_cutoff=50,
        random_under_sample=False,
        order_filt=5,
        include_seizure_type=False,
        mode=DETECT_MODE,
        resample=pd.Timedelta(seconds=constants.COMMON_DELTA),
        # num_splits_per_sample=None,
        gap=pd.Timedelta(seconds=1),
        # overlap=None,
        return_sequence_label=False,
        num_samples=None,
        max_bckg_samps_per_file=None,
        overlapping_augmentation=False,
        n_process=4,
        include_montage_channels=False,  # which montage channels have seizure
        include_segment=False,
        shuffle=True,
):
    self.mode = mode
    self.n_process = n_process
    self.resample = resample
    self.segment_file_tuples = segment_file_tuples
    self.columns_to_use = columns_to_use
    self.use_numpy = use_numpy
    self.lp_cutoff = lp_cutoff
    self.hp_cutoff = hp_cutoff
    self.order_filt = order_filt
    self.sampleInfo = Dict()
    self.gap = gap
    # if overlap is None:
    #     self.overlap = gap
    # else:
    #     self.overlap = overlap
    self.include_seizure_type = include_seizure_type
    self.num_samples = num_samples
    self.return_sequence_label = return_sequence_label
    self.random_under_sample = random_under_sample
    self.overlapping_augmentation = overlapping_augmentation
    self.include_montage_channels = include_montage_channels
    self.include_segment = include_segment
    # self.num_splits_per_sample = num_splits_per_sample

    currentIndex = 0
    for token_file_path, segment in self.segment_file_tuples:
        if shuffle:
            segment = segment.reindex(np.random.permutation(segment.index))  # randomly sample from each eeg file
        num_bckg_samps_per_file = 0
        for time_period, label in segment.iteritems():
            # segment = segment.resample(gap).mode()  # if gap isn't the correct size, just resample
            if num_samples is not None and currentIndex >= self.num_samples:
                break
            if max_bckg_samps_per_file is not None and num_bckg_samps_per_file >= max_bckg_samps_per_file and label == "bckg":
                continue
            if (label != "bckg" and "sz" not in label and self.mode == EdfDatasetSegmentedSampler.DETECT_MODE):
                continue  # go to next; too close to a seizure to be safe
            if (label == "postsz" or label == "presz"):
                continue
            # for split_num in range(num_splits_per_sample):
            if self.mode == EdfDatasetSegmentedSampler.DETECT_MODE:
                self.sampleInfo[currentIndex].label = ("sz" in label)
            if (label != "bckg" and label != "sample" and self.mode == EdfDatasetSegmentedSampler.PREDICT_MODE):
                continue  # go to next; either too close to a seizure to be safe or a seizure itself, which we don't want to deal with
            if self.mode == EdfDatasetSegmentedSampler.PREDICT_MODE:
                self.sampleInfo[currentIndex].label = (label == "sample")
            if label == "bckg":
                num_bckg_samps_per_file += 1
            if self.include_seizure_type:
                self.sampleInfo[currentIndex].label = (self.sampleInfo[currentIndex].label, label)  # attach the specific label on the EDSS
            self.sampleInfo[currentIndex].token_file_path = token_file_path
            if self.include_segment:
                self.sampleInfo[currentIndex].label = (*self.sampleInfo[currentIndex].label, self.sampleInfo[currentIndex].token_file_path)
            self.sampleInfo[currentIndex].sample_num = (time_period) / self.gap
            self.sampleInfo[currentIndex].sample_width = self.gap
            currentIndex += 1
    if self.random_under_sample:
        self.balance()
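
# Hedged sketch of the segment_file_tuples structure the sampler iterates over: a list of
# (token_file_path, segment) tuples, where each segment is a pd.Series of string labels
# ("bckg", seizure labels containing "sz", "presz", "postsz", ...) indexed by pd.Timedelta
# offsets spaced by `gap`. The path and labels below are illustrative, not real TUH data.
import pandas as pd

segment = pd.Series(
    ["bckg", "bckg", "fnsz", "fnsz", "postsz"],
    index=pd.timedelta_range(start="0s", periods=5, freq="1s"),
)
segment_file_tuples = [("/path/to/patient/session/token_0001.edf", segment)]
# sampler = EdfDatasetSegmentedSampler(segment_file_tuples, gap=pd.Timedelta(seconds=1))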
def edf_eeg_2_df(path, resample=None, dtype=np.float32, start=0, filter=True, max_length=None):
    """
    Transforms an EDF file into a pd.DataFrame, with channel labels as columns.
    This does not attempt to concatenate multiple time series but only takes a single edf filepath

    Parameters
    ----------
    path : str
        path of the edf file
    resample : pd.Timedelta
        if None, returns original data with original sampling;
        otherwise, resamples to the given Timedelta (mean over each bin)
    dtype : dtype
        used to reduce memory consumption (np.float64 can be expensive)
    start : int or pd.Timedelta
        which place to start at
    filter : bool
        if True, apply a 1-50 Hz Butterworth bandpass filter
    max_length : pd.Timedelta
        if given, read at most this much data starting from `start`

    Returns
    -------
    pd.DataFrame
        index is time, columns is waveform channel label
    """
    global file_list, file_list_lock
    # hack around pyedflib having access to only one file handle at a time:
    # if the file is already open elsewhere, wait until it is released
    waiting_for_path = True
    while waiting_for_path:
        file_list_lock.acquire()
        if path not in file_list:
            file_list.add(path)
            waiting_for_path = False
        file_list_lock.release()

    with pyedflib.EdfReader(path, check_file_size=pyedflib.CHECK_FILE_SIZE) as reader:
        channel_names = [headerDict['label'] for headerDict in reader.getSignalHeaders()]
        sample_rates = [headerDict['sample_rate'] for headerDict in reader.getSignalHeaders()]
        for headerDict in reader.getSignalHeaders():
            if headerDict["dimension"] != "uV" and headerDict["label"] in util_funcs.get_common_channel_names():
                raise Exception(f"channel {headerDict['label']} is not in uV")
        start_time = pd.Timestamp(reader.getStartdatetime())
        all_channels = []
        for i, channel_name in enumerate(channel_names):
            if type(start) == pd.Timedelta:
                # we ask for time t=1 s, then we take into account the sample rate
                start_count_native_freq = start / pd.Timedelta(seconds=1 / sample_rates[i])
            else:
                start_count_native_freq = start
            if max_length is None:  # read everything
                signal_data = reader.readSignal(i, start=start_count_native_freq)
            else:
                # adding a fudge factor of 5 for any off-by-1 errors
                numStepsToRead = int(np.ceil(max_length / pd.Timedelta(seconds=1 / sample_rates[i]))) + 5
                if "messy_read_outputs" in read_config() and read_config()["messy_read_outputs"]:
                    sys.stdout = open(os.devnull, "w")
                signal_data = reader.readSignal(i, start=start_count_native_freq, n=numStepsToRead)
                if "messy_read_outputs" in read_config() and read_config()["messy_read_outputs"]:
                    sys.stdout = sys.__stdout__
            signal_data = pd.Series(
                signal_data,
                index=pd.date_range(
                    start=start_time,
                    freq=pd.Timedelta(seconds=1 / sample_rates[i]),
                    periods=len(signal_data)),
                name=channel_name)
            all_channels.append(signal_data)

    data = pd.concat(all_channels, axis=1)
    data.index = data.index - data.index[0]
    data = data.astype(dtype)
    if filter:
        segSize = data.index[1] - data.index[0]
        data = data.apply(
            lambda col: filters.butter_bandpass_filter(
                col, lowcut=1, highcut=50, fs=pd.Timedelta(seconds=1) / segSize, order=5),
            axis=0)
    if resample is not None:
        data = data.resample(resample).mean()

    file_list_lock.acquire()
    file_list.remove(path)
    file_list_lock.release()
    return data
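
# Hedged usage sketch for edf_eeg_2_df: read 4 seconds of one edf token starting at t=0,
# band-pass filtered and resampled to a common sampling interval. The file path and the
# 1/256 s resample interval are illustrative assumptions.
# df = edf_eeg_2_df(
#     "/path/to/token_0001.edf",
#     start=pd.Timedelta(seconds=0),
#     max_length=pd.Timedelta(seconds=4),
#     resample=pd.Timedelta(seconds=1 / 256),
#     filter=True,
# )
# df.index is a TimedeltaIndex starting at 0; df.columns are the file's channel labels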