def buildDataset(cls, all_results: Dataset, testie: Testie) -> List[tuple]:
    """Build one learning dataset per result type.

    Each run of ``all_results`` that has a non-empty result mapping
    contributes one feature row (the values of ``run.variables``) and, for
    every result type it reports, the mean of that type's results.

    Columns whose format in ``testie.variables.dtype()`` is ``str`` are
    encoded as integer indices in order of first appearance; the distinct
    original values are stored in ``dtype['values'][column]`` and the
    column format is switched to ``int``.

    :param all_results: mapping of run -> mapping of result type -> results.
    :param testie: provides ``variables.dtype()``, a dict with at least a
        ``'formats'`` list.
    :return: list of ``(result_type, X, y, dtype)`` tuples, where ``y`` holds
        the per-run means and ``X`` the feature rows of the runs that
        reported ``result_type``, in the same order as ``y``.
    """
    dtype = testie.variables.dtype()
    rows = []
    means = OrderedDict()    # result type -> mean of each contributing run
    row_ids = OrderedDict()  # result type -> index in ``rows`` of each mean
    for run, results_types in all_results.items():
        if results_types is None or len(results_types) == 0:
            continue
        row_index = len(rows)
        rows.append(list(run.variables.values()))
        for result_type, results in results_types.items():
            means.setdefault(result_type, []).append(np.mean(results))
            row_ids.setdefault(result_type, []).append(row_index)

    dtype['values'] = [None] * len(dtype['formats'])
    for col, fmt in enumerate(dtype['formats']):
        if fmt is not str:
            continue
        dtype['formats'][col] = int
        levels = OrderedSet()
        for row in rows:
            levels.add(row[col])
            row[col] = levels.index(row[col])
        dtype['values'][col] = list(levels)

    X = np.array(rows, ndmin=2)

    lset = []
    for result_type, values in means.items():
        ids = row_ids[result_type]
        # A run may lack some result types: keep only the rows that actually
        # produced a value so X and y stay aligned row for row. When every
        # run reported this type, the full matrix is reused unchanged.
        X_type = X if len(ids) == len(rows) else X[ids]
        lset.append((result_type, X_type, np.array(values), dtype))
    return lset
def writeversion(self, testie, all_results: Dataset, allow_overwrite: bool = False, kind=False, reload=True):
    """Write the results of ``testie``, optionally merging with stored ones.

    :param testie: test script whose results are written; used to derive the
        result file name and to load previously stored results.
    :param all_results: results to write. When ``kind`` is truthy it is
        iterated as a mapping of kind name -> results, each written to its
        own ``<base filename>-<kind name>`` file.
    :param allow_overwrite: forwarded unchanged to ``_writeversion``.
    :param kind: forwarded to ``load_results``; also selects the per-kind
        file layout described above.
    :param reload: when true, the results returned by
        ``load_results(cache=False)`` are updated with ``all_results`` and
        the merged set is written instead of ``all_results`` alone.

    When ``reload`` is false but the cached previous results contain more
    entries than ``all_results``, an error is printed and the merge path is
    forced, so neither the stored nor the new results are lost.
    """
    if not reload and all_results:
        prev = self.load_results(testie=testie, kind=kind, cache=True)
        if prev and len(all_results) < len(prev):
            print("ERROR ! Have less results than before. Forcing update write !")
            # Fall through to the merge below. The former early return made
            # this assignment dead and silently dropped the new results,
            # contradicting the message above.
            reload = True

    if reload:
        results = self.load_results(testie=testie, kind=kind, cache=False)
        if results:
            results.update(all_results)
            all_results = results

    base_filename = self.__resultFilename(testie)
    if kind:
        for kind_name, kind_results in all_results.items():
            self._writeversion(base_filename + '-' + kind_name, kind_results, allow_overwrite)
    else:
        self._writeversion(base_filename, all_results, allow_overwrite)
def writeversion(self, script, all_results: Dataset):
    """Write ``all_results`` to the result file of ``script``.

    One line is written per run, in the form::

        key:value,key:value={type:v1,v2},{type:v1,v2}

    Variables are sorted by key. A tuple-valued variable is written as its
    second element, and every colon inside a value is prefixed with a
    backslash. A result type whose results are ``None`` is written with an
    empty value list.

    :param script: test script used to derive the result file name.
    :param all_results: mapping of run -> mapping of result type -> iterable
        of result values (or ``None``).
    """
    filename = self.__resultFilename(script)
    directory = os.path.dirname(filename)
    try:
        # A bare file name has no directory part; there is nothing to create
        # and makedirs('') would raise.
        if directory:
            os.makedirs(directory, exist_ok=True)
    except OSError:
        print("Error : could not create %s" % directory)

    # The context manager closes the file even if formatting a run raises.
    with open(filename, 'w') as f:
        for run, results in all_results.items():
            variables = []
            for key, val in sorted(run.variables.items()):
                if isinstance(val, tuple):
                    val = val[1]
                variables.append(key + ":" + str(val).replace(':', '\\:'))

            type_results = []
            for result_type, values in results.items():
                str_results = [] if values is None else [str(val) for val in values]
                type_results.append(result_type + ':' + ','.join(str_results))

            f.write(','.join(variables) + "={" + '},{'.join(type_results) + "}\n")
def buildDataset(cls, all_results: Dataset, testie: Testie):
    """Build a feature matrix and a target vector from a set of runs.

    Each run whose results are not ``None`` contributes one feature row (the
    values of ``run.variables``) and one target (the mean of its results).
    Columns whose format in ``testie.variables.dtype()`` is ``str`` are
    replaced by integer indices in order of first appearance.

    :param all_results: mapping of run -> results (or ``None``).
    :param testie: provides ``variables.dtype()``, a dict with at least a
        ``'formats'`` list.
    :return: ``(X, y)`` where ``X`` is an array of at least two dimensions
        holding the feature rows and ``y`` is a structured array with a
        single float field named ``'result'``.
    """
    dtype = testie.variables.dtype()
    y = []
    dataset = []
    for run, results in all_results.items():
        if results is None:
            continue
        dataset.append(list(run.variables.values()))
        y.append(np.mean(results))

    for col, fmt in enumerate(dtype['formats']):
        if fmt is not str:
            continue
        dtype['formats'][col] = int
        levels = OrderedSet()
        for row in dataset:
            levels.add(row[col])
            row[col] = levels.index(row[col])

    X = np.array(dataset, ndmin=2)
    return X, np.array(y, dtype=[('result', float)])
def run(build: Build, all_results: Dataset, testie: Testie, max_depth=3, filename=None):
    """Fit a decision-tree regressor on the results and print statistics.

    The report printed to standard output contains the feature importances
    of the fitted tree, the feature row with the highest result, and, for
    every variable that takes more than one value, the average mean and the
    relative standard deviation of the results per value.

    :param build: used for ``build.result_path(...)`` to derive the graph
        file name when ``filename`` is not given.
    :param all_results: mapping of run -> results (or ``None``).
    :param testie: provides ``variables.dtype()['names']`` (feature names)
        and ``filename``.
    :param max_depth: maximum depth of the tree. The tree graph is only
        written when this is not ``None`` and at most 8.
    :param filename: output path of the tree graph; its extension selects
        the output format.
    """
    print("Building dataset...")
    X, y = Statistics.buildDataset(all_results, testie)
    print("Learning dataset built with %d samples and %d features..." % (X.shape[0], X.shape[1]))
    clf = tree.DecisionTreeRegressor(max_depth=max_depth)
    clf = clf.fit(X, y)

    if max_depth is None or max_depth > 8:
        print("No tree graph when maxdepth is > 8")
    else:
        dot_data = tree.export_graphviz(
            clf,
            out_file=None,
            filled=True,
            rounded=True,
            special_characters=True,
            feature_names=testie.variables.dtype()['names'])
        graph = pydotplus.graph_from_dot_data(dot_data)
        if filename:
            out_path = filename
        else:
            out_path = build.result_path(testie.filename, 'pdf', suffix='_clf')
        # The output format is the file extension without its leading dot.
        graph.write(out_path, format=os.path.splitext(out_path)[1][1:])
        print("Decision tree visualization written to %s" % out_path)

    print("")
    print("Feature importances :")
    # noinspection PyUnresolvedReferences
    for name, importance in zip(testie.variables.dtype()['names'], clf.feature_importances_):
        print(" %s : %0.2f" % (name, importance))

    # Distinct values taken by each variable across all runs.
    vars_values = {}
    for run, _results in all_results.items():
        for k, v in run.variables.items():
            vars_values.setdefault(k, set()).add(v)

    print('')
    print("Better :")
    best = X[y['result'].argmax()]
    print(" ", end='')
    # The variable names of the first run label the columns of ``best``.
    first_run = next(iter(all_results.items()))[0]
    for i, (k, _value) in enumerate(first_run.variables.items()):
        print("%s = %s, " % (k, best[i]), end='')
    print(' : %.02f' % y['result'].max())

    print('')
    print("Means and std/mean per variables :")
    for k, vals in vars_values.items():
        # A variable with a single value cannot explain any variation.
        if len(vals) == 1:
            continue
        print("%s :" % k)
        for v in sorted(vals):
            tot = 0
            std = 0
            n = 0
            for run, results in all_results.items():
                if run.variables[k] == v and results is not None:
                    tot += np.mean(results)
                    std += np.std(results)
                    n += 1
            if n == 0:
                print(" %s : None" % v)
            else:
                print(" %s : (%.02f,%.02f), " % (v, tot / n, std / n / (tot / n)))
        print("")