def main(kind, file):
    """Plot one EEG file interactively, or every file of a given kind.

    Args:
        kind: Forwarded to ``files_builder`` to select the files to plot when
            no single file is requested.
        file: Path of a single file to plot. An empty string means "plot all
            files of ``kind``".
    """
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    # Only let MNE report errors.
    mne.set_log_level(logging.ERROR)

    # Compare by value: ``is not ''`` checks object identity, which is not
    # guaranteed for equal strings and triggers a SyntaxWarning on 3.8+.
    if file != '':
        _, ext = os.path.splitext(file)
        file = files_builder(ext=ext).single_file(file)
        interactive_plot(file)
        return

    logger.info(f'Plotting EEG singals of kind {kind}.')
    for file in files_builder(kind):
        interactive_plot(file)
def create_training_data(output_path, kind, window=None, minl=0, maxl=None,
                         existing_df=None):
    """Build (or resume) the per-file feature table and pickle it.

    Rows are indexed by (patient, trial), plus a ``window`` level when
    ``window`` is given; columns are the product of ``CHANNEL_NAMES`` and
    ``algos.measure_names``. Only rows that still contain a null value are
    (re)computed, and the table is pickled to ``output_path`` after each
    computed row.

    Args:
        output_path: Destination of the pickled dataframe.
        kind: Forwarded to ``files_builder`` to select the input files.
        window: Forwarded to ``compute_nl``; also selects the index layout.
        minl: Forwarded to ``compute_nl``.
        maxl: Forwarded to ``compute_nl``.
        existing_df: Previously created table to continue filling; a fresh
            empty one is created when omitted.
    """
    logging.info('Creating training data.')

    patients = list(range(1, 134))
    trials = ['a', 'b']
    column_index = pd.MultiIndex.from_product(
        [CHANNEL_NAMES, algos.measure_names], names=['channel', 'measure'])
    if window is None:
        row_index = pd.MultiIndex.from_product(
            [patients, trials], names=['patient', 'trial'])
    else:
        row_index = pd.MultiIndex.from_product(
            [patients, trials, [0]], names=['patient', 'trial', 'window'])

    if existing_df is None:
        main_df = pd.DataFrame(columns=column_index, index=row_index)
    else:
        main_df = existing_df

    for file in files_builder(kind):
        row_key = (file.id, file.trial)
        # A row without any null value has already been computed.
        if not main_df.loc[row_key].isnull().values.any():
            logging.debug(f"Skipping row ({file.id}, {file.trial})")
            continue

        features = compute_nl(file.df, window, minl, maxl)
        main_df.loc[row_key] = pd.Series(features)
        logging.debug("New row: \n%s" % features)
        logging.debug(f'Saving training data at {output_path}.')
        logging.info(f'Processed file {file.number}')
        # Persist after every row so an interrupted run can be resumed.
        main_df.to_pickle(output_path)
def create_surrogates():
    """Write a surrogate CSV for every processed file.

    For each file, every column of its dataframe is passed through ``iaaft``
    and the first element of the result is stored under the same column name.
    The surrogate frame is written tab-separated into ``SURROGATE_ROOT``
    under the original file name with a ``.csv`` extension.
    """
    for file in files_builder(DataKind.PROCESSED):
        source = file.df
        # Same index and columns as the source, initially empty.
        surrogate = pd.DataFrame().reindex_like(source)
        for column in source.columns:
            result = iaaft(source[column], maxiter=1000, atol=1e-8,
                           rtol=1e-10)
            surrogate[column] = result[0]

        stem, _ = os.path.splitext(file.name)
        target = os.path.join(SURROGATE_ROOT, stem + '.csv')
        surrogate.to_csv(target, sep='\t')
def preprocess_all(output_file=PROCESSED_ROOT):
    """Preprocess every raw recording and save each result as a ``.fif`` file.

    Args:
        output_file: Directory the processed files are written into (it is
            joined with each generated file name).

    Returns:
        The last successfully preprocessed MNE object, or ``None`` when no
        file was processed.
    """
    last_processed = None
    for file in files_builder(DataKind.RAW):
        raw = files_builder(DataKind.MNE, file=file)
        try:
            processed = preprocess_raw_mne_file(raw)
        except ValueError:
            # Raised when duration is < 60 s, we may safely skip the file
            # The message is a single string: passing the second half as a
            # separate positional argument made logging treat it as a
            # %-format argument and fail while formatting the record.
            logging.debug(f'Skipping file {file.name} because of '
                          'insufficient duration.')
            continue

        processed_file_name = os.path.splitext(file.name)[0] + '.fif'
        processed.save(os.path.join(output_file, processed_file_name),
                       proj=False, overwrite=True)
        last_processed = processed
    return last_processed
def compute_nl_measure(model_path, measure='lyap'):
    """Evaluate a registered measure on each unit's receptive field.

    For every processed file, channel, model layer and unit, the input range
    returned by ``get_rf_start_end`` is passed together with the preprocessed
    channel data to the (memoized) algorithm whose ``algo_name`` equals
    ``measure``. The resulting frame is pickled under ``CORRS_ROOT``.

    Args:
        model_path: Path handed to ``k.load_model``; also used to derive the
            output file name.
        measure: ``algo_name`` of the algorithm to run.

    Raises:
        Exception: If no registered algorithm is named ``measure``.
    """
    idxs = pd.MultiIndex.from_product([list(range(1, 134)), ['a', 'b']],
                                      names=['patient', 'trial'])
    # 'channel' is listed explicitly because the loop below writes to it.
    cols = ['channel_n', 'feature', 'value', 'layer', 'unit', 'trial',
            'channel']
    # No dtype cast on the empty frame: the earlier astype() mapping named
    # columns that did not exist ('channel', 'filter'), and integer dtypes
    # cannot hold the NaN placeholders of rows that are not filled yet.
    df = pd.DataFrame(index=idxs, columns=cols)

    model = k.load_model(model_path)
    sizes, strides = receptive_field_sizes_and_strides(model)

    # contain model name, feature
    # NOTE(review): split(...)[0] is the directory part of model_path, not
    # the file name -- confirm this is the intended source of the model name.
    model_name, _ = splitext(split(model_path)[0])
    measures_path = join(CORRS_ROOT, '_'.join((model_name, measure)))

    for alg in algos.registered_algos:
        # Match against the requested measure (was the literal 'measure').
        if alg.algo_name == measure:
            mem_alg = Memoize(alg)
            break
    else:
        raise Exception(f'Algorithm {measure} not registered.')

    for file in files_builder(DataKind.PROCESSED):
        idx = file.id
        trial = file.trial
        # Do the same TS processing we did to construct the model input
        logging.info(f'Trial {idx}-{trial}...')
        for channel in CHANNEL_NAMES:
            data = preprocess(file.df[channel])
            # Compute values for each start end layer
            logging.info(f'Channel {channel}...')
            for layer_n, layer in enumerate(model.layers):
                shape = layer.shape
                units = range(shape[1])
                # Both halves need the f prefix to interpolate the counts.
                logging.info(f'Layer {layer_n} with {shape[-1]+1} '
                             f'filters, each {shape[1]+1} units...')
                # Iteration over filters is currently disabled; only units
                # are visited.
                for unit_n in units:
                    start, end = get_rf_start_end(sizes, strides, unit_n,
                                                  layer_n)
                    value = mem_alg(data, start, end)
                    # NOTE(review): every channel/layer/unit writes to the
                    # same (patient, trial) row, so only the last
                    # combination is kept -- confirm the intended layout.
                    df.loc[(idx, trial), 'channel'] = channel
                    df.loc[(idx, trial), 'value'] = value
                    df.loc[(idx, trial), 'layer'] = layer_n
                    df.loc[(idx, trial), 'unit'] = unit_n
    df.to_pickle(measures_path)
def interactive_plot(file):
    """Show a blocking, interactive plot of a file's channels.

    The file is loaded through ``files_builder(DataKind.MNE, ...)`` and
    plotted. When the file is of kind ``DataKind.RAW`` it is additionally
    run through ``preprocess_raw_mne_file`` and the result is plotted too.
    """
    def _show(mne_object):
        # block=True: the call returns only once the plot window is closed.
        figure = mne_object.plot(block=True, scalings='auto')
        figure.set_size_inches(18.5, 10.5, forward=True)

    data = files_builder(DataKind.MNE, file=file)
    _show(data)
    logging.info(f'Plotting file {file.name} of kind={file.kind}, '
                 f'sfreq={data.info["sfreq"]}.')

    if file.kind != DataKind.RAW:
        return

    logging.info(f'Plotting processed file {file.name}.')
    _show(preprocess_raw_mne_file(data))
def compute_correlations(model_path, measures_path):
    """Run the rebuilt model on every channel of every ``DIRECT`` file.

    Work in progress: the intent is to collect pairs of (measure value, unit
    activation) for each unit in each layer and correlate them. At present
    the function only loads the measures, prepares an empty result frame and
    evaluates the model; nothing is stored or returned.

    Args:
        model_path: Forwarded to ``rebuild_model``.
        measures_path: Pickle file holding the precomputed measures.
    """
    # Was ``df.load_pickle(...)``, which read the local ``df`` before it was
    # assigned. NOTE: unpickling executes arbitrary code; only load files
    # from a trusted source.
    measures_df = pd.read_pickle(measures_path)

    # We will will have to use 9*2*num_units... Max num of units is
    # 5000 -> 9*2*4*5000*4 = 1.44 GB :/
    # Empty four-level index; from_arrays avoids the ``labels=`` keyword,
    # which newer pandas releases no longer accept.
    idx = pd.MultiIndex.from_arrays([[]] * 4, names=['l', 'h', 'w', 'f'])
    cols = ['activation', 'value']
    df = pd.DataFrame(index=idx, columns=cols)

    for file in files_builder(DataKind.DIRECT):
        for channel_n, channel in enumerate(CHANNEL_NAMES):
            # TODO Compare each channel's value with the activation,
            # save the pair
            model = rebuild_model(model_path, channel_n)
            out = model(file.df[channel])
def create_sigma_pkl(in_df, kind, output_path):
    """Compute a sigma value per (file, channel, measure) and pickle the table.

    For every measure in the second column level of ``in_df`` that has a
    registered algorithm, and for every row of ``in_df``, the matching file
    is looked up and ``compute_sigma`` is called per channel with the
    channel's time series, the stored statistic and the algorithm.

    Args:
        in_df: Frame indexed by (patient, trial) with (channel, measure)
            columns holding the statistics to compare against.
        kind: Value convertible to ``DataKind`` that selects the input files.
        output_path: Destination of the pickled result.
    """
    cols = pd.MultiIndex.from_product([CHANNEL_NAMES, algos.measure_names],
                                      names=['channel', 'measure'])
    idxs = pd.MultiIndex.from_product([list(range(1, 134)), ['a', 'b']],
                                      names=['patient', 'trial'])
    main_df = pd.DataFrame(columns=cols, index=idxs)

    for measure_name in in_df.columns.levels[1]:
        for algo in algos.registered_algos:
            if measure_name == algo.algo_name:
                func = algo
                break
        else:
            logging.warning(
                f'Algorithm for measure {measure_name} not regitered, skipping'
            )
            continue

        for row in in_df.iterrows():
            index = row[0][0]
            trial = row[0][1]
            file = files_builder(DataKind(kind)).from_index_trial(index, trial)
            assert ((file.id, file.trial) == (index, trial))

            for channel_name in CHANNEL_NAMES:
                true_stat = in_df.loc[(file.id, file.trial), channel_name]
                try:
                    # TODO: Choose if the time series is to be shortened
                    time_series = file.df.loc[:, channel_name]
                except IndexError:
                    logging.info('Caught index error, skipping...')
                    break
                sigma = compute_sigma(time_series, true_stat, func)
                new_row = {(channel_name, measure_name): sigma}
                # Write the single cell. Assigning a one-entry Series to the
                # whole row aligns on the columns and resets every other
                # (channel, measure) cell of that row to NaN.
                main_df.loc[(file.id, file.trial),
                            (channel_name, measure_name)] = sigma
                logging.debug("New row: \n%s" % new_row)

            logging.debug(f'Saving training data at {output_path}.')
            logging.info(f'Processed file {file.number}')
            # Persist after every file so partial results survive a crash.
            main_df.to_pickle(output_path)
def create_meta_df(output_path):
    """Build the per-(patient, trial) metadata table and pickle it.

    One row is filled for each processed file from the patient's row in the
    meta table: response, before/after flag, sex, age, sampling frequency,
    the ``M_1``/``M_4`` scores, their ratio, and three-class labels obtained
    by comparing the scores with the 0.33 and 0.66 quantiles. If the
    training-measures pickle exists under ``LABELED_ROOT``, it is joined with
    the metadata and saved next to ``output_path`` as ``measures_w_meta.pkl``.

    Args:
        output_path: Destination of the pickled metadata frame.
    """
    def _three_class_label(score, bounds):
        # -1 / 0 / 1 for the lower, middle and upper score band.
        lower, upper = bounds
        if score <= lower:
            return -1
        if score <= upper:
            return 0
        return 1

    logging.info('Creating dataframe with the meta information.')
    cols = [
        'resp', 'b/a', 'sex', 'age', 'sfreq', 'sc', 'sc_bef', 'sc_aft', 'dep',
        'dep_bef', 'dep_aft', 'change'
    ]
    idxs = pd.MultiIndex.from_product([list(range(1, 134)), ['a', 'b']],
                                      names=['patient', 'trial'])
    extra_df = pd.DataFrame(columns=cols, index=idxs)
    meta_df = files_builder(DataKind.META)

    # The thresholds depend only on the meta table, so compute them once.
    # pd.concat replaces Series.append, which pandas 2.0 removed.
    pooled_scores = pd.concat([meta_df['M_1'], meta_df['M_4']])
    bounds_pooled = (np.quantile(pooled_scores, 0.33),
                     np.quantile(pooled_scores, 0.66))
    bounds_before = (meta_df['M_1'].quantile(0.33),
                     meta_df['M_1'].quantile(0.66))
    bounds_after = (meta_df['M_4'].quantile(0.33),
                    meta_df['M_4'].quantile(0.66))

    for file in files_builder(DataKind.PROCESSED):
        index = file.id
        trial = file.trial
        row = (index, trial)
        meta_row = meta_df.loc[index, :]
        m1 = meta_row['M_1']
        m4 = meta_row['M_4']
        score = m1 if trial == 'a' else m4

        # Single .loc[row, column] writes: the chained form
        # ``.loc[row][column] = value`` may assign to a temporary copy and
        # leave the frame unchanged.
        extra_df.loc[row, 'resp'] = meta_row['RESP_4W']
        extra_df.loc[row, 'b/a'] = 0 if trial == 'a' else 1
        extra_df.loc[row, 'age'] = meta_row['AGE']
        extra_df.loc[row, 'sex'] = meta_row['SEX']
        extra_df.loc[row, 'sfreq'] = meta_row['freq']
        extra_df.loc[row, 'sc'] = score
        extra_df.loc[row, 'sc_bef'] = m1
        extra_df.loc[row, 'sc_aft'] = m4
        extra_df.loc[row, 'change'] = m1 / m4
        extra_df.loc[row, 'dep'] = _three_class_label(score, bounds_pooled)
        extra_df.loc[row, 'dep_bef'] = _three_class_label(m1, bounds_before)
        extra_df.loc[row, 'dep_aft'] = _three_class_label(m4, bounds_after)
        logging.debug('Added row: \n{}'.format(extra_df.loc[(index, trial)]))

    # NOTE(review): the integer casts fail if any (patient, trial) row was
    # left unfilled (NaN) -- confirm every index entry has a processed file.
    extra_df = extra_df.astype({
        'resp': 'category',
        'b/a': 'category',
        'sex': 'category',
        'sfreq': int,
        'age': int,
        'sc': float,
        'sc_bef': int,
        'sc_aft': int,
        'dep': int,
        'dep_bef': int,
        'dep_aft': int,
        'change': float
    })

    logging.debug('The resulting data: \n{}'.format(extra_df.describe()))
    logging.debug(f'Saving metadata dataframe at {output_path}.')
    extra_df.to_pickle(output_path)

    output_folder = os.path.dirname(output_path)
    measures_file = os.path.join(LABELED_ROOT, 'all', 'training.pickle')
    if os.path.isfile(measures_file):
        measures_df = pd.read_pickle(measures_file)
        joined_df = measures_df.join(extra_df)
        joined_path = os.path.join(output_folder, 'measures_w_meta.pkl')
        logging.debug(f'Saving joined dataframe at {joined_path}:\n'
                      f'{joined_df}')
        joined_df.to_pickle(joined_path)