def train(epigenomes, labels, models, kwargs, region, cell_line):
    """Fit every model on stratified holdouts of one region and score it.

    Results are checkpointed to ``<cell_line>/results_<region>.json`` after
    each fitted model. Any (model, holdout) pair that ``precomputed``
    reports as already present in the loaded results is skipped.

    Parameters
    ----------
    epigenomes:
        Container indexed by ``region``; the selected entry must expose
        ``.values``.
    labels:
        Container indexed by ``region`` yielding the targets.
    models:
        Objects exposing ``fit`` and ``predict``.
    kwargs:
        One dict of ``fit`` keyword arguments per model, in the same order.
    region:
        Key of the region to train on; also part of the results file name.
    cell_line:
        Prefix (directory) of the results file.

    Returns
    -------
    ``pandas.DataFrame`` with one row per (model, run type, holdout) and the
    ``holdout`` column dropped.
    """
    features = epigenomes[region].values
    targets = labels[region]
    results_path = cell_line + "/results_" + region + ".json"

    splits = 10
    splitter = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)

    # Resume from a previous run when a results file is already there.
    results = []
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)

    holdout_progress = tqdm(
        enumerate(splitter.split(features, targets)),
        total=splits,
        desc="Computing holdouts",
        dynamic_ncols=True,
    )
    for holdout, (train_idx, test_idx) in holdout_progress:
        model_progress = tqdm(
            zip(models, kwargs),
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        )
        for model, params in model_progress:
            # Keras "Sequential" models are identified by their own name,
            # every other model by its class name.
            class_name = model.__class__.__name__
            model_name = model.name if class_name == "Sequential" else class_name
            if precomputed(results, model_name, holdout):
                continue

            model.fit(features[train_idx], targets[train_idx], **params)
            for run_type, indices in (("train", train_idx), ("test", test_idx)):
                results.append({
                    "model": model_name,
                    "run_type": run_type,
                    "holdout": holdout,
                    **report(targets[indices], model.predict(features[indices])),
                })
            # Checkpoint after every model so an interrupted run can resume.
            compress_json.local_dump(results, results_path)

    return pd.DataFrame(results).drop(columns=["holdout"])
def training_sequence_models(models, holdouts, cell_line, task):
    """Train each sequence model on each holdout and record last-epoch scores.

    After every fitted model the accumulated results are written to
    ``json/<cell_line>/results_sequence_<task>.json``.

    Parameters
    ----------
    models:
        Models exposing ``name`` and a Keras-style ``fit`` returning an
        object with a ``history`` attribute.
    holdouts:
        Sized iterable of ``(train, test)`` pairs; both elements must expose
        ``steps_per_epoch``.
    cell_line:
        Sub-directory of ``json/`` in which the results file is written.
    task:
        Task name, used in the results file name and in log messages.

    Returns
    -------
    List of result dicts, two per fitted model and holdout (``"train"`` and
    ``"test"`` run types).
    """
    results = []
    logging.info("Number of holdouts: {}".format(len(holdouts)))

    for holdout, (train, test) in enumerate(holdouts):
        progress = tqdm(
            models,
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        )
        for model in progress:
            if __precomputed(results, model.name, holdout):
                continue

            logging.info(
                "Training model {} holdout {}".format(model.name, holdout))
            fit_output = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=600,
                shuffle=True,
                verbose=False,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min", patience=10),
                ],
            )

            # Only the metrics of the final epoch are kept.
            scores = pd.DataFrame(fit_output.history).iloc[-1].to_dict()
            train_scores = {
                metric: value
                for metric, value in scores.items()
                if not metric.startswith("val_")
            }
            # Validation metrics are stored without their "val_" prefix.
            test_scores = {
                metric[4:]: value
                for metric, value in scores.items()
                if metric.startswith("val_")
            }

            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": holdout,
                **train_scores,
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": holdout,
                **test_scores,
            })

            logging.info(
                "Add results {} to Json --> results_sequence_{}.json".format(
                    model.name, task))
            output_path = "json/" + cell_line + "/results_sequence_" + task + ".json"
            compress_json.local_dump(results, output_path)

    return results
def test_compress_json():
    """Round-trip a random dictionary through every default extension.

    The dictionary is dumped and re-loaded once with ``dump``/``load`` and
    once with ``local_dump``/``local_load``; each re-loaded object must hash
    to the same value as the original.

    The temporary directories are removed in ``finally`` clauses so that a
    failing assertion does not leave files behind.
    """
    import os

    D = random_string_dict(10, 10)
    key = sha256(D)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()

    try:
        for ext in extensions:
            path = f"random_dirs/test.json.{ext}"
            compress_json.dump(D, path)
            assert key == sha256(compress_json.load(path))
    finally:
        shutil.rmtree("random_dirs", ignore_errors=True)

    # local_dump writes relative to the calling file, so the directory to
    # clean up is located from __file__ rather than from the working
    # directory (the previous hard-coded "tests/random_dirs" only worked
    # when the tests were launched from the repository root).
    local_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "random_dirs")
    try:
        for ext in extensions:
            path = f"random_dirs/test.json.{ext}"
            compress_json.local_dump(D, path)
            assert key == sha256(compress_json.local_load(path))
    finally:
        shutil.rmtree(local_dir, ignore_errors=True)
def predict_sequences(sequences: pd.DataFrame, labels: pd.DataFrame, models: List[Model]) -> pd.DataFrame:
    """Train the sequence models on every holdout and collect their scores.

    Results are checkpointed to the file named by ``_get_filename("seq")``
    after each fitted model. Any (model, holdout) pair that ``_precomputed``
    reports as already present in the loaded results is skipped.

    Parameters
    ----------
    sequences: pd.DataFrame
        Input passed to the holdout splitter and to ``get_holdout``.
    labels: pd.DataFrame
        Targets passed to the holdout splitter and to ``get_holdout``.
    models: List[Model]
        Objects whose ``get_model()`` returns a ``(model, fit_kwargs)`` pair.

    Returns
    -------
    pd.DataFrame
        One row per stored result, with the ``holdout`` column dropped.
    """
    filename = _get_filename("seq")
    if os.path.exists(filename):
        # The checkpoint is written with compress_json below, so it is read
        # back with compress_json as well: plain json.load would fail on a
        # compressed file name. The path is resolved exactly as in the
        # os.path.exists check above.
        results = compress_json.load(filename)
    else:
        results = []

    for i, (train_index, test_index) in tqdm(
        enumerate(_get_holdouts().split(sequences, labels)),
        total=get_default('splits'),
        desc="Computing holdouts",
        dynamic_ncols=True,
    ):
        train, test = get_holdout(train_index, test_index, sequences, labels)
        for model, params in tqdm(
            [wrapper.get_model() for wrapper in models],
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        ):
            if _precomputed(results, model, i):
                continue
            history = model.fit(
                train,
                validation_data=test,
                steps_per_epoch=train.steps_per_epoch,
                validation_steps=test.steps_per_epoch,
                callbacks=[
                    EarlyStopping(
                        monitor="val_loss",
                        mode="min",
                        patience=get_default('patience'),
                    )
                ],
                **params
            ).history
            # Only the metrics of the final epoch are kept.
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append(_get_result(model, 'train', i, **{
                key: value
                for key, value in scores.items()
                if not key.startswith("val_")
            }))
            # Validation metrics are stored without their "val_" prefix.
            results.append(_get_result(model, 'test', i, **{
                key[4:]: value
                for key, value in scores.items()
                if key.startswith("val_")
            }))
            # Checkpoint after every model so an interrupted run can resume.
            compress_json.local_dump(results, filename)

    return pd.DataFrame(results).drop(columns=['holdout'])
def training_tabular_models(holdouts, models, kwargs, cell_line, task):
    """Train each tabular model on each holdout and score train and test sets.

    After every fitted model the accumulated results are written to
    ``json/<cell_line>/results_tabular_<task>.json``.

    Parameters
    ----------
    holdouts:
        Sized iterable of ``(train, test)`` pairs; each element is indexed
        with the keys ``"epigenomes"`` and ``"labels"``.
    models:
        Objects exposing ``fit`` and ``predict``.
    kwargs:
        One dict of ``fit`` keyword arguments per model, in the same order.
    cell_line:
        Sub-directory of ``json/`` in which the results file is written.
    task:
        Task name, used in the results file name and in log messages.

    Returns
    -------
    List of result dicts, two per fitted model and holdout (``"train"`` and
    ``"test"`` run types).
    """
    results = []
    logging.info("Number of holdouts: {}".format(len(holdouts)))

    for holdout, (train, test) in enumerate(holdouts):
        progress = tqdm(
            zip(models, kwargs),
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        )
        for model, params in progress:
            # Keras "Sequential" models are identified by their own name,
            # every other model by its class name.
            class_name = model.__class__.__name__
            model_name = model.name if class_name == "Sequential" else class_name
            if __precomputed(results, model_name, holdout):
                continue

            logging.info(
                "Training model {} holdout {}".format(model_name, holdout))
            model.fit(train["epigenomes"], train["labels"], **params)

            for run_type, subset in (("train", train), ("test", test)):
                results.append({
                    "model": model_name,
                    "run_type": run_type,
                    "holdout": holdout,
                    **__report(
                        subset["labels"],
                        model.predict(subset["epigenomes"])),
                })

            logging.info(
                "Add results {} to Json --> results_tabular_{}.json".format(
                    model_name, task))
            output_path = "json/" + cell_line + "/results_tabular_" + task + ".json"
            compress_json.local_dump(results, output_path)

    return results
def predict_epigenomics(data: np.ndarray, labels: pd.DataFrame, models: List[Model]) -> pd.DataFrame:
    """Train the epigenomic models on every holdout and collect their scores.

    Results are checkpointed to the file named by ``_get_filename("epi")``
    after each fitted model. Any (model, holdout) pair that ``_precomputed``
    reports as already present in the loaded results is skipped.

    Parameters
    ----------
    data: np.ndarray
        Feature matrix, indexed with the train/test index arrays.
    labels: pd.DataFrame
        Targets, indexed with the same train/test index arrays.
    models: List[Model]
        Objects whose ``get_model()`` returns a ``(model, fit_kwargs)`` pair.

    Returns
    -------
    pd.DataFrame
        One row per stored result, with the ``holdout`` column dropped.
    """
    filename = _get_filename("epi")
    if os.path.exists(filename):
        # The checkpoint is written with compress_json below, so it is read
        # back with compress_json as well: plain json.load would fail on a
        # compressed file name. The path is resolved exactly as in the
        # os.path.exists check above.
        results = compress_json.load(filename)
    else:
        results = []

    for i, (train, test) in tqdm(
        enumerate(_get_holdouts().split(data, labels)),
        total=get_default('splits'),
        desc="Computing holdouts",
        dynamic_ncols=True,
    ):
        for model, params in tqdm(
            [wrapper.get_model() for wrapper in models],
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        ):
            if _precomputed(results, model, i):
                continue
            model.fit(data[train], labels[train], **params)
            results.append(_get_result(
                model, 'train', i,
                **_report(labels[train], model.predict(data[train]))))
            results.append(_get_result(
                model, 'test', i,
                **_report(labels[test], model.predict(data[test]))))
            print("end")
            # Checkpoint after every model so an interrupted run can resume.
            compress_json.local_dump(results, filename)

    return pd.DataFrame(results).drop(columns=['holdout'])
def store_graph_data(self, data: Dict, graph_name: str) -> Dict:
    """Store the provided data for the given graph.

    Parameters
    -----------------------
    data: Dict,
        Data to store for the graph.
    graph_name: str,
        Name of graph to store data for.

    Returns
    -----------------------
    Whatever `compress_json.local_dump` returns for the write.
    """
    target_path = self.get_graph_data_path(graph_name)
    return compress_json.local_dump(data, target_path)
# NOTE(review): fragment of a larger script. The loop headers enclosing the
# first statements are outside this view and the original indentation was
# lost, so the nesting level of everything before the "--save" section must
# be confirmed against the full file.

# Append the current transition (state, action, reward, done flag, next
# state) to the per-field lists of the buffer.
buff["state"].append(prev_state.tolist())
buff["action"].append(action)
buff["reward"].append(reward)
buff["done"].append(bool(done))
buff["next_state"].append(state.tolist())
buffer_size += 1
step += 1
# Progress line: ANSI escape for bold bright-white text, no newline.
print("\033[0;1;97m", end='')
print("nb episode : {} Buffersize : {} / {} ".format( nb_episode, buffer_size, buff_max), end='')
# Elapsed whole seconds since `tic` (set outside this fragment), printed in
# italic bright red.
tac = time.time()
print("\033[3;91m", end='')
print("{} secondes".format(int(tac - tic)), end='')
# Reset the text attributes and return the cursor to the start of the line so
# the next progress print overwrites this one.
print("\033[0;m \r", end='')
nb_episode += 1
# Switch to bold green and announce that the buffer is being saved.
print("\033[0;1;32m")
print("sauvegarde du buffer...")
if "--save" in sys.argv:
    # The argument following "--save" is used as the file name.
    arg_index = sys.argv.index("--save")
    save_name = sys.argv[arg_index + 1]
    compress_json.local_dump(buff, "preTrain/" + save_name + ".json.gz")
    print('sauvegardé en tant que "' + save_name + '"')
else:
    # Without "--save" the buffer is written under the name "default".
    compress_json.local_dump(buff, "preTrain/default.json.gz")
    print('sauvegardé en tant que "default"')
def _dump_unsupported_graphs(self, unsupported_graphs: Set[str]):
    """Write the known unsupported graphs to `self.unsupported_graphs_path`.

    Parameters
    -----------------------
    unsupported_graphs: Set[str],
        Names of the graphs to record as unsupported.
    """
    # JSON has no set type and Python's JSON encoder rejects set objects,
    # so the names are written as a list; sorting keeps the file content
    # stable from one run to the next.
    compress_json.local_dump(
        sorted(unsupported_graphs),
        self.unsupported_graphs_path
    )
def _dump_corrupted_graphs(self, corrupted_graphs: Set[str]):
    """Write the known corrupted graphs to `self.corrupted_graphs_path`.

    Parameters
    -----------------------
    corrupted_graphs: Set[str],
        Names of the graphs to record as corrupted.
    """
    # JSON has no set type and Python's JSON encoder rejects set objects,
    # so the names are written as a list; sorting keeps the file content
    # stable from one run to the next.
    compress_json.local_dump(
        sorted(corrupted_graphs),
        self.corrupted_graphs_path
    )
def train_sequence(epigenomes, labels, genome, cell_line, region, models):
    """Train sequence models on stratified holdouts of one region.

    Results are checkpointed to ``<cell_line>/sequence_<region>.json`` after
    each fitted model. Any (model, holdout) pair that ``precomputed``
    reports as already present in the loaded results is skipped.

    Parameters
    ----------
    epigenomes:
        Container indexed by ``region``; the selected table's index levels
        become the columns of the bed table given to the splitter.
    labels:
        Container indexed by ``region`` yielding the targets.
    genome:
        Passed through to ``get_holdout``.
    cell_line:
        Prefix (directory) of the results file.
    region:
        Key of the region to train on; also part of the results file name.
    models:
        Models exposing ``name`` and a Keras-style ``fit``.

    Returns
    -------
    ``pandas.DataFrame`` with one row per stored result and the ``holdout``
    column dropped.
    """
    region_table = epigenomes[region]
    # Keep only the columns that were the index levels of the region table.
    bed = region_table.reset_index()[region_table.index.names]
    region_labels = labels[region]
    results_path = cell_line + "/sequence_" + region + ".json"

    splits = 2
    splitter = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)

    # Resume from a previous run when a results file is already there.
    results = []
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)

    holdout_progress = tqdm(
        enumerate(splitter.split(bed, region_labels)),
        total=splits,
        desc="Computing holdouts",
        dynamic_ncols=True,
    )
    for holdout, (train_index, test_index) in holdout_progress:
        train, test = get_holdout(
            train_index, test_index, bed, region_labels, genome)
        model_progress = tqdm(
            models,
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        )
        for model in model_progress:
            if precomputed(results, model.name, holdout):
                continue

            fit_output = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=100,
                shuffle=True,
                verbose=False,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min", patience=50),
                ],
            )

            # Only the metrics of the final epoch are kept.
            scores = pd.DataFrame(fit_output.history).iloc[-1].to_dict()
            train_scores = {
                metric: value
                for metric, value in scores.items()
                if not metric.startswith("val_")
            }
            # Validation metrics are stored without their "val_" prefix.
            test_scores = {
                metric[4:]: value
                for metric, value in scores.items()
                if metric.startswith("val_")
            }

            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": holdout,
                **train_scores,
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": holdout,
                **test_scores,
            })
            # Checkpoint after every model so an interrupted run can resume.
            compress_json.local_dump(results, results_path)

    return pd.DataFrame(results).drop(columns=["holdout"])
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    """Train sequence models with balanced class weights on stratified holdouts.

    Results are checkpointed to ``<cell_line>_<region_type>_sequence.json``
    after each fitted model. Any (model, holdout) pair that ``precomputed``
    reports as already present in the loaded results is skipped.

    Parameters
    ----------
    models:
        Models exposing ``name`` and a Keras-style ``fit``.
    epigenomes:
        Container indexed by ``region_type``; the selected entry is converted
        with ``to_bed``.
    nlabels:
        Container indexed by ``region_type``; the selected entry must expose
        ``.values``.
    region_type:
        Key of the region to train on; also part of the results file name.
    cell_line:
        Prefix of the results file name.

    Returns
    -------
    ``pandas.DataFrame`` with one row per stored result and the ``holdout``
    column dropped.
    """
    # Reproducibility settings.
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()
    results_path = cell_line + "_" + region_type + "_sequence.json"

    # Resume from a previous run when a results file is already there.
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)
    else:
        results = []

    # `classes` and `y` are keyword-only in current scikit-learn releases;
    # passing them by name also works on the older positional signature.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    # Weights are keyed by position in the sorted unique labels.
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    for i, (train_index, test_index) in tqdm(
        enumerate(holdouts.split(bed, labels)),
        total=splits,
        desc="Computing holdouts",
        dynamic_ncols=True,
    ):
        # NOTE(review): 1024 is presumably the batch size - confirm against
        # get_holdout.
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("="*80)
        for model in tqdm(
            models,
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        ):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(
                        monitor="val_loss",
                        mode="min",
                        patience=50,
                        restore_best_weights=True,
                    ),
                ]
            ).history
            # Only the metrics of the final epoch are kept.
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            # Validation metrics are stored without their "val_" prefix.
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **{
                    key[4:]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            # Checkpoint after every model so an interrupted run can resume.
            compress_json.local_dump(results, results_path)

    df = pd.DataFrame(results).drop(columns="holdout")
    return df
def train_model_epi(models, epigenomes, nlabels, region_type, cell_line):
    """Train epigenomic models with balanced class weights on stratified holdouts.

    Results are checkpointed to ``<cell_line>_<region_type>_epigenomic.json``
    after each fitted model. Any (model, holdout) pair that ``precomputed``
    reports as already present in the loaded results is skipped.

    Parameters
    ----------
    models:
        Models exposing ``predict`` and a Keras-style ``fit`` (it receives
        ``epochs``, ``validation_split``, ``class_weight`` and ``callbacks``).
    epigenomes:
        Container indexed by ``region_type`` yielding the feature matrix.
    nlabels:
        Container indexed by ``region_type``; the selected entry must expose
        ``.values``.
    region_type:
        Key of the region to train on; also part of the results file name.
    cell_line:
        Prefix of the results file name.

    Returns
    -------
    ``pandas.DataFrame`` with one row per stored result and the ``holdout``
    column dropped.
    """
    # Reproducibility settings.
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    y = nlabels[region_type].values.ravel()
    # NOTE(review): X is indexed below with integer index arrays (X[train]),
    # which assumes positional row indexing as on a NumPy array - confirm
    # that epigenomes[region_type] is not a DataFrame here.
    X = epigenomes[region_type]
    print("Num feature: " + str(X.shape[1]))

    splits = 51
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)

    # `classes` and `y` are keyword-only in current scikit-learn releases;
    # passing them by name also works on the older positional signature.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(y), y=y)
    # Weights are keyed by position in the sorted unique labels.
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    results_path = cell_line + "_" + region_type + "_epigenomic.json"
    # Resume from a previous run when a results file is already there.
    if os.path.exists(results_path):
        results = compress_json.local_load(results_path)
    else:
        results = []

    for i, (train, test) in tqdm(
        enumerate(holdouts.split(X, y)),
        total=splits,
        desc="Computing holdouts",
        dynamic_ncols=True,
    ):
        for model in tqdm(
            models,
            total=len(models),
            desc="Training models",
            leave=False,
            dynamic_ncols=True,
        ):
            # Keras "Sequential" models are identified by their own name,
            # every other model by its class name.
            model_name = (
                model.__class__.__name__
                if model.__class__.__name__ != "Sequential"
                else model.name
            )
            if precomputed(results, model_name, i):
                continue
            model.fit(
                X[train],
                y[train],
                epochs=1000,
                shuffle=True,
                verbose=False,
                validation_split=0.1,
                batch_size=1024,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(
                        monitor="val_loss",
                        mode="min",
                        patience=50,
                        restore_best_weights=True,
                    ),
                ]
            )
            results.append({
                "model": model_name,
                "run_type": "train",
                "holdout": i,
                **report(y[train], model.predict(X[train]))
            })
            results.append({
                "model": model_name,
                "run_type": "test",
                "holdout": i,
                **report(y[test], model.predict(X[test]))
            })
            # Checkpoint after every model so an interrupted run can resume.
            compress_json.local_dump(results, results_path)

    df = pd.DataFrame(results)
    df = df.drop(columns=["holdout"])
    return df