def train_test_split(Xy):
    """Split Xy into two dictionaries. If the input dictionary was not built
    with train_test_merge(Xy1, Xy2), return the input dictionary twice.

    Parameters
    ----------
    Xy: dict

    Returns
    -------
    dict1, dict2 : split dictionaries

    Example
    -------
    >>> train_test_merged = train_test_merge(dict(a=1, b=2), dict(a=33, b=44, c=55))
    >>> print(train_test_merged)
    {'c/test': 55, 'a/test': 33, 'b/test': 44, 'a/train': 1, 'b/train': 2}
    >>> print(train_test_split(train_test_merged))
    ({'a': 1, 'b': 2}, {'a': 33, 'c': 55, 'b': 44})
    >>> print(train_test_split(dict(a=1, b=2)))
    ({'a': 1, 'b': 2}, {'a': 1, 'b': 2})
    """
    keys_train = [k for k in Xy if key_pop(k)[1] == conf.TRAIN]
    keys_test = [k for k in Xy if key_pop(k)[1] == conf.TEST]
    if not keys_train and not keys_test:
        return Xy, Xy
    if keys_train and keys_test:
        Xy_train = {key_pop(k)[0]: Xy[k] for k in keys_train}
        Xy_test = {key_pop(k)[0]: Xy[k] for k in keys_test}
        return Xy_train, Xy_test
    raise KeyError("data-flow could not be split")

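# The helpers key_pop and key_push used throughout come from
# epac.workflow.base. A minimal sketch consistent with the doctest output
# above (the "/" separator and the return conventions are inferred from
# usage, not confirmed against the real implementation):

def key_push(key, suffix):
    # key_push("a", "train") -> "a/train"
    return key + "/" + suffix if key else suffix


def key_pop(key, index=-1):
    # index=-1 pops the last segment:  "a/train" -> ("a", "train")
    # index=0 pops the first segment:  "CV(nb=0)/SVC" -> ("CV(nb=0)", "SVC")
    parts = key.split("/")
    if index == 0:
        return parts[0], "/".join(parts[1:])
    return "/".join(parts[:-1]), parts[-1]
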
def reduce(self, store_results=True):
    # Terminal (leaf) node: return results
    if not self.children:
        return self.load_state(name="results")
    # 1) Build sub-aggregates over children
    children_results = [child.reduce(store_results=False)
                        for child in self.children]
    result_set = ResultSet(*children_results)
    if not self.reducer:
        return result_set
    # Group by key, ignoring the fold/permutation number, which is the
    # head of the key. Use an OrderedDict to preserve running order.
    from collections import OrderedDict
    groups = OrderedDict()
    for result in result_set:
        # Remove the head of the key
        _, key_tail = key_pop(result["key"], index=0)
        result["key"] = key_tail
        if key_tail not in groups:
            groups[key_tail] = list()
        groups[key_tail].append(result)
    # For each key, stack results
    reduced = ResultSet()
    for key in groups:
        result_stacked = Result.stack(*groups[key])
        reduced.add(self.reducer.reduce(result_stacked))
    return reduced

def reduce(self, result):
    if self.select_regexp:
        inputs = [key3 for key3 in result
                  if re.search(self.select_regexp, str(key3))]
    else:
        inputs = result.keys()
    if len(inputs) != 2:
        raise KeyError("Need to find exactly two results to compute a score."
                       " Found %i: %s" % (len(inputs), inputs))
    key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
    key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
    y_true = result[key_true]
    y_pred = result[key_pred]
    try:
        # If list of arrays (CV, LOO, etc.), concatenate them
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
    except ValueError:
        pass
    out = Result(key=result["key"])
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  average=None)
    key, _ = key_pop(key_pred, -1)
    out[key_push(key, conf.SCORE_PRECISION)] = p
    out[key_push(key, conf.SCORE_RECALL)] = r
    out[key_push(key, conf.SCORE_RECALL_MEAN)] = r.mean()
    out[key_push(key, conf.SCORE_F1)] = f1
    out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true, y_pred)
    if self.keep:
        out.update(result)
    return out

def reduce(self, store_results=True):
    # Terminal (leaf) node: return results
    if not self.children:
        return self.load_results()
    # 1) Build sub-aggregates over children
    children_results = [child.reduce(store_results=False)
                        for child in self.children]
    result_set = ResultSet(*children_results)
    if not self.reducer:
        return result_set
    if not self.need_group_key:
        reduced = ResultSet()
        reduced.add(self.reducer.reduce(result_set))
        return reduced
    # Group by key, ignoring the fold/permutation number, which is the
    # head of the key. Use an OrderedDict to preserve running order.
    from collections import OrderedDict
    groups = OrderedDict()
    for result in result_set:
        # Remove the head of the key
        _, key_tail = key_pop(result["key"], index=0)
        result["key"] = key_tail
        if key_tail not in groups:
            groups[key_tail] = list()
        groups[key_tail].append(result)
    # For each key, stack results
    reduced = ResultSet()
    for key in groups:
        result_stacked = Result.stack(*groups[key])
        reduced.add(self.reducer.reduce(result_stacked))
    return reduced

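# A toy illustration of the grouping step above: hypothetical result keys
# whose head is the CV fold and whose tail names the estimator (the key
# names are invented for the example).
from collections import OrderedDict

keys = ["CV(nb=0)/SVC", "CV(nb=1)/SVC", "CV(nb=0)/LDA", "CV(nb=1)/LDA"]
groups = OrderedDict()
for k in keys:
    head, tail = k.split("/", 1)  # same effect as key_pop(k, index=0)
    groups.setdefault(tail, []).append(head)
print(list(groups))  # ['SVC', 'LDA'] -- running order preserved
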
def reduce(self, result):
    if self.select_regexp:
        inputs = [key3 for key3 in result
                  if re.search(self.select_regexp, str(key3))]
    else:
        inputs = result.keys()
    if len(inputs) != 2:
        raise KeyError("Need to find exactly two results to compute a "
                       "score. Found %i: %s" % (len(inputs), inputs))
    key_true = [k for k in inputs if k.find(conf.TRUE) != -1][0]
    key_pred = [k for k in inputs if k.find(conf.PREDICTION) != -1][0]
    y_true = result[key_true]
    y_pred = result[key_pred]
    try:
        # If list of arrays (CV, LOO, etc.), concatenate them
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)
    except ValueError:
        pass
    out = Result(key=result["key"])
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  average=None)

    # Compute the p-value of the recall for each class
    def recall_test(recall, n_trials, apriori_p):
        n_success = recall * n_trials
        pval = binom_test(n_success, n=n_trials, p=apriori_p)
        if recall > apriori_p:
            return pval / 2
        else:
            return 1 - (pval / 2)

    n_classes = len(s)  # Number of classes
    n_obs = len(y_true)
    # A priori probability of each class
    prior_p = s.astype(float) / s.sum()
    r_pvalues = np.zeros_like(r)
    for class_index in range(n_classes):
        n_trials = s[class_index]
        r_pvalues[class_index] = recall_test(r[class_index], n_trials,
                                             prior_p[class_index])
    # Compute the p-value of the mean recall
    mean_r = r.mean()
    mean_r_pvalue = binom_test(int(mean_r * n_obs), n=n_obs, p=.5)
    key, _ = key_pop(key_pred, -1)
    out[key_push(key, conf.SCORE_PRECISION)] = p
    out[key_push(key, conf.SCORE_RECALL)] = r
    out[key_push(key, conf.SCORE_RECALL_PVALUES)] = r_pvalues
    out[key_push(key, conf.SCORE_RECALL_MEAN)] = mean_r
    out[key_push(key, conf.SCORE_RECALL_MEAN_PVALUE)] = mean_r_pvalue
    out[key_push(key, conf.SCORE_F1)] = f1
    out[key_push(key, conf.SCORE_ACCURACY)] = accuracy_score(y_true, y_pred)
    if self.keep:
        out.update(result)
    return out

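# A standalone sketch of the one-sided recall test above. Note that
# scipy.stats.binom_test was deprecated and later removed from SciPy;
# scipy.stats.binomtest exposes the one-sided p-value directly. Halving
# the two-sided p-value, as recall_test() does, approximates the same
# one-sided test. The numbers below are illustrative only.
from scipy.stats import binomtest

# 30 of 50 samples of a class correctly recalled, chance level p = 0.5:
res = binomtest(30, n=50, p=0.5, alternative="greater")
print(res.pvalue)  # P(>= 30 successes) under chance-level recall
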
def reduce(self, result):
    diff_perm_nbs = self.get_diff_perm_nbs(result)
    max_r2 = {}
    for perm_nb in diff_perm_nbs:
        max_r2[perm_nb] = self.get_max_r2_with_perm_nb(result, perm_nb)
    # Permutation 0 is the unpermuted data; count how many permutations
    # yield a better max R2
    r2_no_perms = max_r2[0]
    count = 0
    for i in max_r2:
        if i == 0:
            continue
        if r2_no_perms < max_r2[i]:
            count += 1
    p_value = float(count) / float(len(max_r2))
    _, res_key = key_pop(list(result.keys())[0], index=-1)
    out = Result(key=res_key)
    out["pval"] = p_value
    return out

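# A toy illustration of the counting logic above (values invented): the
# p-value is the fraction of entries whose max R2 beats the unpermuted
# one (permutation 0), with permutation 0 kept in the denominator as in
# the code above.
max_r2 = {0: 0.42, 1: 0.10, 2: 0.55, 3: 0.08, 4: 0.47}
count = sum(1 for perm in max_r2 if perm != 0 and max_r2[0] < max_r2[perm])
p_value = float(count) / float(len(max_r2))
print(p_value)  # 2 of 5 entries beat 0.42 -> 0.4
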
def load(self, key=""): """Load everything that is prefixed with key. Parmaters --------- key: str if key point to a file (without the extension), return the file if key point to a directory, return a dictionary where values are objects corresponding to all files found in all sub-directories. Values are indexed with their keys. if key is an empty string, assume dirpath is a tree root. See Also -------- BaseNode.save() """ from epac.configuration import conf from epac.workflow.base import key_pop path = os.path.join(self.dirpath, key) #prefix = os.path.join(path, conf.STORE_FS_NODE_PREFIX) if os.path.isfile(path + conf.STORE_FS_PICKLE_SUFFIX): return self.load_pickle(path + conf.STORE_FS_PICKLE_SUFFIX) if os.path.isfile(path + conf.STORE_FS_JSON_SUFFIX): return self.load_pickle(path + conf.STORE_FS_JSON_SUFFIX) if os.path.isdir(path): filepaths = [] for base, dirs, files in os.walk(self.dirpath): #print base, dirs, files for filepath in [os.path.join(base, basename) for \ basename in files]: filepaths.append(filepath) loaded = dict() dirpath = os.path.join(self.dirpath, "") for filepath in filepaths: _, ext = os.path.splitext(filepath) if ext == conf.STORE_FS_JSON_SUFFIX: key1 = filepath.replace(dirpath, "").\ replace(conf.STORE_FS_JSON_SUFFIX, "") obj = self.load_json(filepath) loaded[key1] = obj elif ext == conf.STORE_FS_PICKLE_SUFFIX: key1 = filepath.replace(dirpath, "").\ replace(conf.STORE_FS_PICKLE_SUFFIX, "") loaded[key1] = self.load_pickle(filepath) else: raise IOError('File %s has an unkown extension: %s' % (filepath, ext)) if key == "": # No key provided assume a whole tree to load tree = loaded.pop(conf.STORE_EXECUTION_TREE_PREFIX) for key1 in loaded: key, attrname = key_pop(key1) #attrname, ext = os.path.splitext(basename) if attrname != conf.STORE_STORE_PREFIX: raise ValueError('Do not know what to do with %s') \ % key1 node = tree.get_node(key) if not node.store: node.store = loaded[key1] else: keys_local = node.store.dict.keys() keys_disk = loaded[key1].dict.keys() if set(keys_local).intersection(set(keys_disk)): raise KeyError("Merge store with same keys") node.store.dict.update(loaded[key1].dict) loaded = tree return loaded