def naive(adjuster, segments):
    """ Implementation of the naive algorithm to find crossings between segments """
    results = {}
    graph = [[0], [0]]  # This will be useful to observe the time complexity
    start = time()
    # This will be useful to print a progress bar in the console
    finished = special.binom(len(segments), 2)
    segments_processed = 0
    for segment_1, segment_2 in combinations(segments, 2):
        if time() - start >= 1200:
            return results, graph
        new_intersection = segment_1.intersection_with(segment_2)
        if new_intersection is not None:
            new_intersection = adjuster.hash_point(new_intersection)
            if new_intersection not in segment_1.endpoints + segment_2.endpoints:
                for segment in [segment_1, segment_2]:
                    if segment in results:
                        results[segment] += [new_intersection]
                    else:
                        results[segment] = [new_intersection]
        segments_processed += 1
        graph[0] += [time() - start]
        graph[1] += [len(list(set().union(*results.values())))]
        progress_bar(finished - segments_processed, finished)
    return results, graph
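# naive() above relies on a console progress_bar helper that is not shown in this
# collection. Below is a minimal sketch of what such a helper could look like for
# the (remaining, total) call style used there; the real helpers in these projects
# take different arguments, so the name and signature here are assumptions only.
import sys

def progress_bar_sketch(remaining, total, width=40):
    done = total - remaining
    filled = int(width * done / total) if total else width
    # carriage return keeps the bar on a single console line
    sys.stdout.write('\r[%s%s] %d/%d' % ('=' * filled, ' ' * (width - filled), done, total))
    sys.stdout.flush()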
def train_epoch(model, opt, lr_scheduler, epoch, dataloader, gpu_id=0, verbose=True):
    _ = model.train()
    batches_per_epoch = len(dataloader)
    train_loss, correct, total = 0, 0, 0
    for batch_idx, (data, targets) in enumerate(dataloader):
        data, targets = Variable(data.cuda(gpu_id)), Variable(targets.cuda(gpu_id))

        # Set LR
        LRSchedule.set_lr(opt, lr_scheduler(epoch + batch_idx / batches_per_epoch))

        opt.zero_grad()
        outputs = model(data)
        loss = F.cross_entropy(outputs, targets)
        loss.backward()
        opt.step()

        train_loss += loss.data[0]
        predicted = torch.max(outputs.data, 1)[1]
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        if verbose:
            progress_bar(
                batch_idx, batches_per_epoch,
                'Loss: %.3f | Acc: %.3f%%' % (train_loss / (batch_idx + 1),
                                              100. * correct / total))

    return float(correct) / total
def _learn_low_rank_metric_gradient_descent(x, relations, rank, S, cost, tol, step, max_iter, verbose):
    """
    This method is used by learn_low_rank_metric as an optimization subprocedure.
    See learn_low_rank_metric for more information.
    """
    dim_count = x.shape[1]  # number of input dimensions (not defined in the original body)
    G = np.random.randn(rank, dim_count) * 0.01
    converged = False
    for e in range(max_iter):
        if verbose:
            helpers.progress_bar(current=e + 1, max=max_iter, update_freq=int(max_iter / 100))
        Gold = G
        # form the matrix \sum_{violated ijk} [(x[i]-x[j])(x[i]-x[j])^T - (x[i]-x[k])(x[i]-x[k])^T]
        GS = np.zeros((rank, dim_count))
        for (i, j, k) in relations:
            dij, dist_ij = _mahalonobis_distance(x[i, :], x[j, :], G, type='low_rank')
            dik, dist_ik = _mahalonobis_distance(x[i, :], x[k, :], G, type='low_rank')
            if dist_ij - dist_ik + 1.0 >= 0.0:
                GS += np.dot(G, S[(i, j, k)])
        # gradient descent step
        grad_G = G + (2 * cost * GS)
        G = G - (step * grad_G)
        # check convergence
        if np.sum(np.square(G - Gold)) < tol:
            converged = True
            if verbose:
                print('\nConverged at {0:d}'.format(e))
            break
    return G, converged
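# A minimal sketch of the low-rank Mahalanobis distance that the routine above calls,
# assuming _mahalonobis_distance returns the difference vector and the squared distance
# under the metric M = G^T G. The name and return convention here are assumptions for
# illustration, not the original helper.
import numpy as np

def _mahalonobis_distance_sketch(xi, xj, G):
    d = xi - xj                   # difference vector in the input space
    Gd = G.dot(d)                 # projection into the low-rank space
    return d, float(Gd.dot(Gd))   # squared distance ||G(xi - xj)||^2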
def augment(x, channel_shift):
    print("Performing channel shifts...")
    helpers.progress_bar(0, x.shape[0], prefix="Progress", suffix="Complete", length=30, fill="=")
    for row in range(x.shape[0]):
        for ch in range(x.shape[3]):
            # shift each channel independently; the original added the shift to the whole
            # image once per channel because the loop variable `ch` was never used
            x[row, :, :, ch] += random.randrange(channel_shift[0], channel_shift[1])
        helpers.progress_bar(row + 1, x.shape[0], prefix="Progress", suffix="Complete", length=30, fill="=")
    # clip to the valid pixel range (x should be a signed or float array for
    # negative shifts and this clipping to behave as intended)
    x[x > 255] = 255
    x[x < 0] = 0
    return x
def bentley_ottmann(adjuster, segments):
    """ Implementation of the Bentley-Ottmann algorithm """
    events = Events(segments)
    living = []
    results = {}
    graph = [[0], [0]]  # This will be useful to observe the time complexity
    start = time()
    finished = len(events.heap)  # This will be useful to print a progress bar in the console
    while events.heap and time() - start < 1200:
        current_point, intersection, lower, upper, horizontal = events.pop_event()
        """
        - lower = [segments that have the current_point as a lower endpoint]
        - upper = [segments that have the current_point as an upper endpoint]
        - intersection = [segments that strike through the current_point but where the
          current_point is not an endpoint of the said segments]
        - horizontal = [horizontal segments that have the current_point as an endpoint]
        """
        for segment in horizontal:
            for other_segment in living:
                find_new_event(segment, other_segment, current_point, events, results, adjuster)
        for segment in upper:
            left_segment, right_segment = nearest_living(segment, living)
            find_new_event(left_segment, right_segment, current_point, events, results, adjuster)
            living.remove(segment)
        for segment in lower:
            living.append(segment)
            living.sort(key=lambda segment: living_key(segment, current_point, adjuster))
            left_segment, right_segment = nearest_living(segment, living)
            find_new_event(segment, right_segment, current_point, events, results, adjuster)
            find_new_event(left_segment, segment, current_point, events, results, adjuster)
        for segment in intersection:
            living.sort(key=lambda segment: living_key(segment, current_point, adjuster))
            left_segment, right_segment = nearest_living(segment, living)
            find_new_event(segment, right_segment, current_point, events, results, adjuster)
            find_new_event(left_segment, segment, current_point, events, results, adjuster)
        graph[0] += [time() - start]
        graph[1] += [len(list(set().union(*results.values())))]
        progress_bar(len(events.heap), finished)
    return results, graph
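# A minimal sketch of what find_new_event is assumed to do in the sweep above: test two
# sweep-line neighbours for an intersection, record it, and schedule it as a future event.
# The segment / adjuster calls mirror those used in naive(); Events.add_intersection_event
# is a hypothetical method name, not the original helper.
def find_new_event_sketch(segment_1, segment_2, current_point, events, results, adjuster):
    if segment_1 is None or segment_2 is None:
        return
    point = segment_1.intersection_with(segment_2)
    if point is None:
        return
    point = adjuster.hash_point(point)
    # only report crossings that are not shared endpoints
    if point not in segment_1.endpoints + segment_2.endpoints:
        for segment in (segment_1, segment_2):
            results.setdefault(segment, []).append(point)
        events.add_intersection_event(point, segment_1, segment_2)  # assumed scheduling call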
def evaluate(self):
    self.update_config()
    if self._model is not None:
        total = {"tp": 0, "tn": 0, "fp": 0, "fn": 0, "n": 0, "p": 0, "t": 0, "f": 0}
        for directory in self._evaluate_data:
            summary = self.set_summary()
            image_paths = self.get_image_paths2("../" + directory)
            X, Y = self.get_data(image_paths)
            image_paths = [path for paths in image_paths for path in paths]
            print("Evaluating dataset")
            with open("../results/cell_data/" + directory.split("/")[-1] + ".csv", "w") as file:
                file.write("cell path,status,confidence\n")
            count = 0
            helpers.progress_bar(0, self._evaluate_batches, prefix="Progress", suffix="Complete", length=30, fill="=")
            for batch in range(self._evaluate_batches):
                x, y = self.get_batch(X, Y, batch, self._evaluate_batches)
                # x = self.normalise(x)
                x /= 255
                predictions = self._model.predict(x, batch_size=self._batch_size, verbose=0)
                for label, prediction in zip(y, predictions):
                    status = self.get_status(label, self.get_prediction(prediction, self._threshold))
                    summary[status] += 1
                    with open("../results/cell_data/" + directory.split("/")[-1] + ".csv", "a") as file:
                        file.write(",".join([image_paths[count], status, str(prediction[1])]) + "\n")
                    count += 1
                helpers.progress_bar(batch + 1, self._evaluate_batches, prefix="Progress", suffix="Complete", length=30, fill="=")
            summary["sample"] = directory.split("/")[-1]
            try:
                summary["sensitivity"] = summary["tp"] / (summary["tp"] + summary["fn"])
            except ZeroDivisionError:
                print("Warning: no positive cases.")
            try:
                summary["specificity"] = summary["tn"] / (summary["tn"] + summary["fp"])
            except ZeroDivisionError:
                print("Warning: no negative cases.")
            print(summary)
            total["tp"] += summary["tp"]
            total["fp"] += summary["fp"]
            total["tn"] += summary["tn"]
            total["fn"] += summary["fn"]
        total["n"] = total["tn"] + total["fp"]
        total["p"] = total["tp"] + total["fn"]
        total["t"] = total["tp"] + total["tn"]
        total["f"] = total["fp"] + total["fn"]
        print(total)
    else:
        print("Warning: no model configured.")
def convert_dwt_images(lead):
    data_x, data_y, fnames = dgen.get_data(
        # n_files=1,
        targets=cfg.targets,
        return_fnames=True,
        channels=[lead],
        norm=True)
    for i, ecg in enumerate(data_x):
        title = fnames[i].split('.')[0]
        save_wavelet_img([i for i in range(data_x.shape[1])],
                         ecg[:, 0],
                         np.arange(1, 128, 2),
                         title=title)
        # used_fnames[fnames[i]] += 1
        progress_bar("Converting to DWT image", i, data_x.shape[0])
def __init__(self, alp, size=100):
    self._alp = alp  # instance of the aircraft landing problem
    self._members = list()
    for i in range(size):
        if i < size - 3:
            # random individuals
            self._members.append(Individual(alp, mode=Individual.Mode.random))
        elif i == size - 3:
            # heuristic individuals
            self._members.append(Individual(alp, mode=Individual.Mode.earliest_h))
        elif i == size - 2:
            self._members.append(Individual(alp, mode=Individual.Mode.target_h))
        elif i == size - 1:
            self._members.append(Individual(alp, mode=Individual.Mode.latest_h))
        hlp.progress_bar(current=i + 1, end=size, title=format('[ INIT POP ]'))

    # Initial sorting according to fitness
    self._members = sorted(self._members)
    print('\r[ INIT POP ] Size: %d / Best fitness: %d' %
          (len(self._members), max(self._members).fitness),
          flush=True)

    # Setup graph structure
    # - stores distances below threshold
    # - makes it easy to derive maximum independent sets (parent selection)
    self._graph = nx.Graph()
    self._threshold = self._alp.nr_planes / 10

    # Add each individual as node to the graph
    for individual in self._members:
        self._graph.add_node(individual)

    # For each pair of individuals that are too close, add an edge to the graph
    relations = [(ind_a, ind_b) for ind_a in self._members
                 for ind_b in self._members if ind_b != ind_a]
    for ind_a, ind_b in relations:
        if not self._graph.has_edge(ind_a, ind_b):
            distance = ind_a.distance(ind_b)
            if distance < self._threshold:
                self._graph.add_edge(ind_a, ind_b, weight=distance)
def _learn_diagonal_metric_gradient_descent(cost, dist_mat, dist_squared, max_iter, tol, step, verbose):
    """
    This method is used by learn_diagonal_metric as an optimization procedure.
    See learn_diagonal_metric for details.
    """
    dim_count = dist_mat.shape[0]
    relation_count = dist_mat.shape[1]
    alpha = np.abs(np.random.randn(relation_count)) * 0.01
    beta = np.abs(np.random.randn(dim_count)) * 0.01
    converged = False
    for e in range(max_iter):
        if verbose:
            helpers.progress_bar(current=e, max=max_iter - 1, update_freq=int(max_iter / 100.0))
        # calculate the gradients
        grad_alpha = 1 + np.dot(dist_mat.T, beta) - np.dot(dist_squared, alpha)
        grad_beta = np.dot(dist_mat, alpha) - beta
        # check stationarity conditions
        #   if \beta_d > 0          then grad_beta_d  = 0.0
        #   if \beta_d = 0          then grad_beta_d  < 0.0
        #   if cost > \alpha_r > 0  then grad_alpha_r = 0.0
        #   if \alpha_r = 0         then grad_alpha_r < 0.0
        #   if \alpha_r = cost      then grad_alpha_r > 0.0
        if np.allclose(a=grad_beta[beta > 0.0], b=0.0, atol=tol) and \
           np.all(grad_beta[np.isclose(a=beta, b=0.0, atol=tol)] < 0.0) and \
           np.allclose(a=grad_alpha[np.logical_and(alpha > 0.0, alpha < cost)], b=0.0, atol=tol) and \
           np.all(grad_alpha[np.isclose(a=alpha, b=0.0, atol=tol)] < 0.0) and \
           np.all(grad_alpha[np.isclose(a=alpha, b=cost, atol=tol)] > 0.0):
            converged = True
            if verbose:
                print("\nConverged at {0:d}".format(e))
            break
        # gradient ascent update
        alpha = alpha + step * grad_alpha
        beta = beta + step * grad_beta
        # projection step
        alpha[alpha < 0.0] = 0.0
        alpha[alpha > cost] = cost
        beta[beta < 0.0] = 0.0
    return alpha, beta, converged
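# A minimal sketch of how the returned beta vector would typically be used as a diagonal
# Mahalanobis metric, d(x, y) = sum_d beta_d * (x_d - y_d)^2. The function name below is
# hypothetical; the actual consumer of beta in learn_diagonal_metric may differ.
import numpy as np

def diagonal_metric_distance(x, y, beta):
    diff = np.asarray(x) - np.asarray(y)
    return float(np.dot(beta, diff * diff))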
def get_data(self, image_paths):
    n = sum([len(paths) for paths in image_paths])
    x = np.empty((n, self._image_shape[0], self._image_shape[1], self._image_shape[2]), dtype=np.uint8)
    y = np.empty((n, 1), dtype=np.uint8)
    row = 0
    for label, paths in enumerate(image_paths):
        print("Loading images from class " + str(label))
        l = len(paths)
        helpers.progress_bar(0, l, prefix="Progress", suffix="Complete", length=30, fill="=")
        for i, path in enumerate(paths):
            y[row] = label
            x[row] = cv2.imread(path)
            row += 1
            helpers.progress_bar(i + 1, l, prefix="Progress", suffix="Complete", length=30, fill="=")
    return x, y
def eval_epoch(model, dataloader, gpu_id=0, verbose=True):
    _ = model.eval()
    batches_per_epoch = len(dataloader)
    eval_loss, correct, total = 0, 0, 0
    for batch_idx, (data, targets) in enumerate(dataloader):
        data, targets = Variable(data.cuda(gpu_id), volatile=True), Variable(targets.cuda(gpu_id))

        outputs = model(data)
        loss = F.cross_entropy(outputs, targets)

        eval_loss += loss.data[0]
        predicted = torch.max(outputs.data, 1)[1]
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        if verbose:
            progress_bar(
                batch_idx, batches_per_epoch,
                'Loss: %.3f | Acc: %.3f%%' % (eval_loss / (batch_idx + 1),
                                              100. * correct / total))

    return float(correct) / total
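# A minimal sketch of how train_epoch / eval_epoch above might be driven, assuming legacy
# (pre-0.4) PyTorch to match the Variable / loss.data[0] idioms. fit, num_epochs, lr_max
# and the linear-decay lambda are placeholders for illustration, not taken from the
# original project.
def fit(model, opt, train_loader, valid_loader, num_epochs=10, lr_max=0.1):
    # lr_scheduler receives the fractional epoch (epoch + batch_idx / batches_per_epoch)
    lr_scheduler = lambda progress: lr_max * (1 - progress / num_epochs)  # simple linear decay
    for epoch in range(num_epochs):
        train_acc = train_epoch(model, opt, lr_scheduler, epoch, train_loader)
        valid_acc = eval_epoch(model, valid_loader)
        print('epoch %d | train acc %.3f | valid acc %.3f' % (epoch, train_acc, valid_acc))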
def preprocess_data(data_x, smooth_window_size=51, smooth_order=4,
                    fourier_baseline_resolution=20, verbosity=False):
    """
    function: preprocess_data

    Preprocess the data by smoothing and baseline straightening.

    Args:
        data_x : np.ndarray
            the data to preprocess
        smooth_window_size : int [optional, default: 51]
            window size of the Savitzky-Golay smoothing filter
        smooth_order : int [optional, default: 4]
            polynomial order of the Savitzky-Golay smoothing filter
        fourier_baseline_resolution : int [optional, default: 20]
            resolution used by the Fourier baseline straightening
        verbosity : bool [optional, default: False]
            whether to print progress information

    Returns:
        p_data_x : np.ndarray
            the preprocessed data
    """
    assert data_x.ndim == 3
    if verbosity:
        print("Preprocessing data...")
        start = time.time()
    p_data_x = np.empty(shape=data_x.shape)
    for i, ecg in enumerate(data_x):
        for channel in range(ecg.shape[1]):
            prepped_channel = savitzky_golay(ecg[:, channel],
                                             window_size=smooth_window_size,
                                             order=smooth_order)
            prepped_channel = fourier_straighten(prepped_channel,
                                                 resolution=fourier_baseline_resolution)
            p_data_x[i, :, channel] = prepped_channel
        if verbosity:
            progress_bar("Processed", i, data_x.shape[0])
    if verbosity:
        print('\nDone, took ' + str(round(time.time() - start, 1)) + ' seconds')
    return p_data_x
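# A minimal sketch of the kind of baseline straightening fourier_straighten might perform:
# keep only the lowest `resolution` Fourier coefficients as a baseline estimate and subtract
# it from the signal. This is an assumption about the helper, not its actual implementation.
import numpy as np

def fourier_straighten_sketch(signal, resolution=20):
    coeffs = np.fft.rfft(signal)
    baseline_coeffs = np.zeros_like(coeffs)
    baseline_coeffs[:resolution] = coeffs[:resolution]   # low-frequency components only
    baseline = np.fft.irfft(baseline_coeffs, n=len(signal))
    return signal - baseline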
def evaluate_for_all_thresholds(self, steps=21):
    self.update_config()
    if self._model is not None:
        data = {"Threshold": [], "TN": [], "TP": [], "FN": [], "FP": [], "N": [], "P": [],
                "T": [], "F": [], "TNR": [], "FNR": [], "TPR": [], "FPR": [], "PPV": [],
                "NPV": [], "LRp": [], "LRn": [], "ACC": [], "F1": [], "MCC": [],
                "Informedness": [], "Markedness": []}
        for threshold in np.linspace(0, 1, steps):
            print("Threshold: " + str(threshold))
            summary = {"tn": 0, "tp": 0, "fn": 0, "fp": 0}
            for directory in self._evaluate_data:
                image_paths = self.get_image_paths2("../" + directory)
                X, Y = self.get_data(image_paths)
                print("Evaluating dataset")
                helpers.progress_bar(0, self._evaluate_batches, prefix="Progress", suffix="Complete", length=30, fill="=")
                for batch in range(self._evaluate_batches):
                    x, y = self.get_batch(X, Y, batch, self._evaluate_batches)
                    # x = self.normalise(x)
                    x /= 255
                    predictions = self._model.predict(x, batch_size=self._batch_size, verbose=0)
                    for label, prediction in zip(y, predictions):
                        status = self.get_status(label, self.get_prediction(prediction, threshold))
                        summary[status] += 1
                    helpers.progress_bar(batch + 1, self._evaluate_batches, prefix="Progress", suffix="Complete", length=30, fill="=")
            data["Threshold"].append(threshold)
            data = self.fill_data(data, summary)
        # build and write the table once all thresholds have been evaluated
        # (converting to a DataFrame inside the loop would break the appends above)
        data = pd.DataFrame.from_dict(data)
        data.to_csv("../data.csv", sep=",")
    else:
        print("Warning: model not configured.")
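# A minimal sketch of the bookkeeping fill_data is assumed to perform: derive rates from the
# raw tn/tp/fn/fp counts and append them to the running dict. Only a few of the listed columns
# are shown, and the real method may compute them differently.
def fill_data_sketch(data, summary):
    tp, tn, fp, fn = summary["tp"], summary["tn"], summary["fp"], summary["fn"]
    data["TP"].append(tp)
    data["TN"].append(tn)
    data["FP"].append(fp)
    data["FN"].append(fn)
    data["TPR"].append(tp / (tp + fn) if (tp + fn) else 0.0)   # sensitivity / recall
    data["FPR"].append(fp / (fp + tn) if (fp + tn) else 0.0)
    data["ACC"].append((tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) else 0.0)
    return data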
def extract_windows(data_x, data_y, pulse_size, fnames=[], verbosity=False):
    """
    function : extract_windows

    Extract all pulses from an ECG and scale them to a given size.

    Args:
        data_x : np.ndarray
            an array of ECG's
        data_y : np.ndarray
            an array of targets of the ECG's
        pulse_size : int
            the size to scale the pulses to
        fnames : list [optional, default: []]
            filenames of the ECG's; when given, a matching list of per-pulse
            filenames is returned as well
        verbosity : bool [optional, default: False]
            whether to print progress information

    Returns:
        pulse_data_x : np.ndarray
            an array of pulses
        pulse_data_y : np.ndarray
            an array of targets of the corresponding pulses
    """
    if verbosity:
        start = time.time()
        print("Extracting and scaling pulses from ECG's...")
    n_samples, n_points, n_channels = data_x.shape
    # if exclude_first_channel:
    #     n_channels = max(n_channels - 1, 1)
    pulses = np.empty(shape=(n_samples * 25, pulse_size, n_channels))
    pulse_targets = np.empty(shape=(n_samples * 25))
    pulse_n = 0
    new_fnames = []
    for i, ecg in enumerate(data_x):
        # We assume lead 0 is a lead where we can extract rpeaks
        rpeaks = get_rpeaks(ecg.T[0])
        ecg_start = 0
        for rpeak_n in range(1, len(rpeaks) - 1):
            pulse = ecg[rpeaks[rpeak_n]:rpeaks[rpeak_n + 1], :]
            try:
                pulses[pulse_n, :, :] = pulse_scale(pulse, pulse_size)
                # store the target before advancing the index so pulses and targets
                # stay aligned (the original incremented pulse_n first)
                pulse_targets[pulse_n] = data_y[i]
                pulse_n += 1
                if fnames:
                    new_fnames.append(fnames[i].split('.')[0] + "_" + str(ecg_start) + ".csv")
                ecg_start += 1
            except:
                # skip pulses that cannot be scaled to pulse_size
                pass
        ecg_start = 0
        if verbosity:
            progress_bar("Extracted pulses from ECG", i, n_samples)
    if verbosity:
        print('Done, took ' + str(round(time.time() - start, 1)) + ' seconds')
    # make sure the returned data is of the correct length
    if len(fnames) > 0:
        return pulses[:pulse_n], pulse_targets[:pulse_n], new_fnames
    return pulses[:pulse_n], pulse_targets[:pulse_n]
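# A minimal sketch of the resampling step pulse_scale is assumed to perform: linearly
# interpolate each channel of a variable-length pulse onto a fixed number of samples.
# The real helper may use a different interpolation scheme; the name below is illustrative.
import numpy as np

def pulse_scale_sketch(pulse, pulse_size):
    old_x = np.linspace(0.0, 1.0, num=pulse.shape[0])
    new_x = np.linspace(0.0, 1.0, num=pulse_size)
    scaled = np.empty((pulse_size, pulse.shape[1]))
    for ch in range(pulse.shape[1]):
        scaled[:, ch] = np.interp(new_x, old_x, pulse[:, ch])
    return scaled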
def generate_parent_sets(self):
    """ Generates a list of parent sets for child generation

    Generates a set of parents according to section 5.5 in Pinol & Beasley (2006).
    In the parent selection process a distance measure is introduced to keep
    diversity in the parent set high. Nodes whose distance is less than a
    specified threshold value have an edge. When selecting a node for a parent
    set, none of its neighbours can be added to the same set. Furthermore,
    better individuals have a higher probability of being selected for a parent
    set because the inclusion frequency corresponds to their rank.

    Returns:
        parent_sets (list of sets): list of parent sets, where each parent set
            can have a different size
        theta (float): the final distance threshold used for the graph edges
    """
    # update sorting to obtain valid ranks
    self._members = sorted(self._members)

    # start with a graph that contains all possible nodes and edges and assign
    # ranks according to each individual's fitness
    main_graph = self._graph.copy()
    for rank, individual in enumerate(self._members):
        # asc sorting --> worst individual is assigned lowest rank
        main_graph.node[individual]['rank'] = rank + 1

    # individuals with distance below threshold must have an edge --> others are
    # removed iteratively to reach a reasonable number of edges
    max_nr_edges = len(self._members) * (len(self._members) - 1) / 2
    theta = self._threshold
    while main_graph.number_of_edges() > max_nr_edges / 2:
        # get edges to be removed because of too large distance
        edges = [(f, t) for (f, t, w) in main_graph.edges(data='weight') if w >= theta]
        main_graph.remove_edges_from(edges)
        # further reduce the number of edges in the next iteration
        theta = theta / 2.0

    total_nr_parents = sum([rank for (parent, rank) in main_graph.nodes(data='rank')])

    # obtain sets of parent individuals while rank (= inclusion frequency)
    # is greater than zero
    parent_sets = []
    while len(main_graph) > 0:
        parent_set = set()
        set_graph = main_graph.copy()
        while len(set_graph) > 0:
            # pick a random node
            individual = rd.choice(list(set_graph))
            parent_set.add(individual)
            new_rank = main_graph.node[individual]['rank'] - 1
            if new_rank <= 0:
                # remove node from initial graph
                main_graph.remove_node(individual)
            else:
                main_graph.node[individual]['rank'] = new_rank
            neighbors = list(set_graph.neighbors(individual))
            set_graph.remove_node(individual)
            set_graph.remove_nodes_from(neighbors)
        parent_sets.append(parent_set)
        # print progress
        nr_parents = total_nr_parents - sum([r for (n, r) in main_graph.nodes(data='rank')])
        hlp.progress_bar(nr_parents, total_nr_parents, '[ SELECTION ]')

    parent_sets = [p_set for p_set in parent_sets if len(p_set) > 1]
    print('\r[ SELECTION ] Sets generated: %d' % (len(parent_sets)), flush=True)
    return parent_sets, theta
def generate_children(self, parent_sets):
    """Generates a set of children from a given set of parents

    Generates a set of children from the given parent sets according to
    section 5.6 in Pinol & Beasley (2006). Then checks whether an individual
    with the same sequence already exists in the population and discards the
    child in that case, according to section 5.7. Locally improves every
    remaining child according to section 5.8, depending on whether the
    non-linear or the linear objective is chosen.

    Args:
        parent_sets (list of sets): sets of individuals

    Returns:
        children (list of individuals): list of generated children
    """
    children = []
    for set_nr, parent_set in enumerate(parent_sets):
        if parent_set is None:
            return
        # generate random weights for each parent
        abs_weights = [rd.random() for _ in range(len(parent_set))]
        # normalize weights
        sum_of_weights = sum(abs_weights)
        rel_weights = [w / sum_of_weights for w in abs_weights]
        chromosome = []
        for i in range(self._alp.nr_planes):
            # determine proportion value
            parent_props = [parent.chromosome[i][1] for parent in parent_set]
            child_prop = round(sum([w * p for w, p in zip(rel_weights, parent_props)]), 6)
            # determine runway
            parent_runways = [parent.chromosome[i][2] for parent in parent_set]
            child_rw = rd.choice(parent_runways)
            # add to chromosome
            chromosome.append((i, child_prop, child_rw))
        child = Individual(alp=self._alp,
                           mode=Individual.Mode.child,
                           chromosome=chromosome,
                           parents=parent_set)
        # exclude duplicates with respect to the current population
        if not self._duplicate(child):
            child.improve()
            children.append(child)
        # print progress
        hlp.progress_bar(current=set_nr + 1, end=len(parent_sets), title=format('[ CROSSOVER ]'))

    # print information
    if len(children) > 0:
        print('\r[ CROSSOVER ] Children generated: %d' % len(children), flush=True)
    else:
        print('\r[ CROSSOVER ] No children generated', flush=True)
    return children
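# A minimal sketch of how the pieces above could be driven for one generation: select
# parent sets, generate children, and merge them into the population. The function name
# and the replacement policy are assumptions, not part of the original Population class,
# and a real implementation would also have to update the diversity graph (_graph).
def evolve_one_generation(population):
    parent_sets, _theta = population.generate_parent_sets()
    children = population.generate_children(parent_sets)
    if children:
        # keep the population size constant by dropping the worst individuals
        # (assumes Individuals sort ascending by fitness, as in __init__)
        population._members = sorted(population._members + children)[len(children):]
    return population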
def evaluate_model(data_x=[], targets=[], fnames=[], model=None):
    """
    function : evaluate_model

    Create an evaluation of the classifier on whole ECG's. An ECG is labeled
    unhealthy when at least the configured ratio of its pulses
    (cfg.min_af_ratio_for_positive_prediction) is classified as unhealthy.

    Args:
        data_x : np.ndarray [optional, default: []]
            the ECG's to evaluate on; loaded from disk when empty
        targets : np.ndarray [optional, default: []]
            the target labels of the ECG's
        fnames : list [optional, default: []]
            the filenames of the ECG's
        model : model or Nonetype [optional, default: None]
            the model to evaluate; loaded from cfg.model_save_name when None

    Returns:
        metrics : list
            [mse, accuracy, precision, recall, fpr_tpr_auc, tpr_ppv_auc, f1]
    """
    if len(data_x) == 0:
        data_x, targets, fnames = dgen.get_data(return_fnames=True,
                                                channels=np.array([0]),
                                                norm=True,
                                                exclude_targets=[2, 3, 4])
    if model is None:
        model = load_model(cfg.model_save_name,
                           custom_objects={'precision': precision, 'recall': recall})

    # n_correct = 0
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    predictions = []
    mse = 0
    if cfg.verbosity:
        print("Evaluating model with ECG's")
        start = time.time()

    for i, ecg in enumerate(data_x):
        # print(ecg.shape)
        pulse_data_x, pulse_data_y = dprep.extract_windows(np.expand_dims(ecg, axis=0),
                                                           np.array([targets[i]]),
                                                           cfg.nn_input_size,
                                                           exclude_first_channel=True)
        nn_pulse_data_x = {"ecg_inp": np.squeeze(pulse_data_x)}
        # preds = model.predict(nn_pulse_data_x)
        preds = [int(round(pred[0])) for pred in model.predict(nn_pulse_data_x)]
        pred = 1 if sum(preds) >= len(preds) * cfg.min_af_ratio_for_positive_prediction else 0
        mse += (targets[i] - pred) ** 2
        predictions.append(pred)
        if pred == 1 and targets[i] == 1:
            tp += 1
        elif pred == 0 and targets[i] == 0:
            tn += 1
        elif pred == 1 and targets[i] == 0:
            fp += 1
        elif pred == 0 and targets[i] == 1:
            # false negative (the original tested targets[i] == 0 here, which could never match)
            fn += 1
        progress_bar("Evaluating ECG", i, data_x.shape[0])

    if cfg.verbosity:
        print('Done, took ' + str(round(time.time() - start, 1)) + ' seconds')

    mse /= len(targets)
    # keep the precision-recall curve's recall separate from the ROC curve's tpr
    # (the original reused the name tpr, mixing values from the two curves)
    ppv, rec, thresholds_pr = precision_recall_curve(targets, predictions)
    fpr, tpr, thresholds_roc = roc_curve(targets, predictions)
    fpr_tpr_auc = sklearn_auc(fpr, tpr)
    tpr_ppv_auc = sklearn_auc(rec, ppv)
    accuracy = (tp + tn) / len(targets)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # specificity = tn / (fp + tn)
    f1 = (2 * tp) / (2 * tp + fp + fn)
    metrics = [mse, accuracy, precision, recall, fpr_tpr_auc, tpr_ppv_auc, f1]
    print(metrics)
    return metrics
def get_data(cfg=None, n_files=None, split=False, channels=[], targets=[],
             return_fnames=False, randomize_order=False, extension='.csv',
             n_points=None, include_first_channel=False, location=None,
             filename_fmt=None, filename_sep="_", verbosity=None, open_files=[]):
    """
    function: get_data

    Returns data from the directory specified in the helpers.py file.

    Args:
        n_files : (Nonetype or int) [optional, default: None]
            the number of samples to return; returns all available data when None
        extension : str [optional, default: '.csv']
            the extension (filetype) of the data; can be anything readable by np.loadtxt
        split : (bool or str) [optional, default: False]
            whether to split the data 50/50 into healthy/non-healthy (only works if
            targets is left empty); if set to 'max', the function determines the
            maximum number of files that keeps the 50/50 ratio (this overrides n_files)
        channels : (list or np.array) [optional, default: []]
            indices of channels to return, or an empty list for all channels
        targets : list [optional, default: []]
            a list of conditions to return
        return_fnames : bool [optional, default: False]
            whether to also return the filenames of the data
        randomize_order : bool [optional, default: False]
            whether to randomize the order of the data
        n_points : int [optional, default: None]
            the number of data points to extract
        include_first_channel : bool [optional, default: False]
            whether to return an extra copy of the first channel (for determining
            rpeaks in data from other channels)
        location : str or Nonetype [optional, default: None]
            the location to load the data from; when None, the processed data
            location specified in the config is used

    Returns:
        data_x : np.ndarray
            the ecg data itself as a 3D array with shape (n_ecgs, ecg_len, n_channels)
        data_y : np.ndarray
            an array of target variables
        files : list [optional]
            a list of all files
    """
    if cfg is None:
        cfg = global_params.cfg
    # if delimiter == None:
    #     delimiter = cfg.delimiter
    if verbosity is None:
        verbosity = cfg.verbosity
    if verbosity:
        print("Assembling data from files...")
        start = time.time()
    if channels == []:
        channels = [x for x in range(cfg.n_channels)]
    n_channels = len(channels)
    if include_first_channel and 0 not in channels:
        channels = [0] + channels
        n_channels += 1
    if location is None:
        location = cfg.processed_data_location
    if targets == []:
        targets = cfg.targets

    # get a list of all filenames
    used_patients = []
    if len(open_files) == 0:
        filters = {}
        if targets:
            filters["TARGET"] = targets
        all_files = get_filenames(location, extension, filters)
    else:
        all_files = open_files

    # set the number of files to all files if no valid amount is specified or more
    # files are requested than available (the original also reset any requested
    # amount that simply differed from the total, which ignored the parameter)
    if type(n_files) != int or n_files > len(all_files):
        n_files = len(all_files)

    # handle the case where the data has to be split with a specified amount
    if split != "max" and split:
        # all healthy files
        sr_files = [f for f in all_files if filename_info(f, "TARGET") == "SR"]
        # all non-healthy files
        asr_files = [f for f in all_files if filename_info(f, "TARGET") != "SR"]
        try:
            # try to get a random sample of these files of the amount specified
            files = random.sample(sr_files, int(n_files / 2))
            files += random.sample(asr_files, int(n_files / 2))
        except ValueError:
            # if that's not possible, the maximum amount that can still be loaded
            # will be used
            warnings.warn("Not enough files with given target for requested "
                          "amount, continuing with lower amount to maintain split.")
            split = "max"

    # handle the case where as many files as possible have to be loaded while the
    # split is maintained
    if split == "max":
        sr_files = []
        asr_files = []
        for f in all_files:
            # create lists of healthy and non-healthy files
            if filename_info(f, "TARGET") == "SR":
                # target is sinus rhythm
                sr_files.append(f)
            else:
                asr_files.append(f)
        # check which of the two lists is smaller, and set this to the size of the
        # sample that has to be taken from both
        m_files = min([len(sr_files), len(asr_files)])
        # concatenate these samples
        files = random.sample(sr_files, m_files)
        files += random.sample(asr_files, m_files)
        # reset number of files (since the number was found by checking what the
        # max amount is without losing the 50/50 ratio)
        n_files = len(files)

    if not split:
        # if no split is required, just take a random subset of the data
        files = random.sample(all_files, n_files)

    if randomize_order:  # specified by args
        np.random.shuffle(files)

    if len(files) != n_files:
        warnings.warn("The amount of files loaded is not the same as the amount requested")

    if n_points is None:
        n_points = cfg.n_points

    data_x = np.empty(shape=(n_files, n_points, n_channels))
    data_y = np.zeros(shape=(n_files, ))
    for i, fname in enumerate(files):
        ecg = np.loadtxt(location + fname,
                         delimiter=cfg.delimiter,
                         dtype=np.float32,
                         usecols=channels,
                         ndmin=2)
        if cfg.normalize_data:  # specified by args
            # divide each value in the ecg by the max of its column
            ecg = ecg / np.amax(np.abs(ecg), axis=0)[None, :]
        data_x[i, :, :] = ecg
        # data_y[i] = 0 if filename_info(fname, "SEX") == "M" else 1
        data_y[i] = getattr(cfg, filename_info(fname, "TARGET")[:2])
        if verbosity:
            progress_bar("Load ECG", i, n_files)
    if verbosity:
        print('Done, took ' + str(round(time.time() - start, 1)) + ' seconds')

    if return_fnames:  # specified by args
        return data_x, data_y, files
    return data_x, data_y