Пример #1
0
def train(epigenomes, labels, models, kwargs, region, cell_line):
    """Fit every model on stratified holdouts of one region and collect scores.

    After each (model, holdout) pair the accumulated results are written to
    ``<cell_line>/results_<region>.json`` so that an interrupted run can skip
    the pairs that ``precomputed`` reports as already done.

    Parameters
    ----------
    epigenomes:
        Mapping from region name to an object exposing ``.values``
        (presumably a pandas DataFrame -- TODO confirm against the caller).
    labels:
        Mapping from region name to that region's labels; the selected entry
        must support indexing with the index arrays yielded by the splitter.
    models:
        Objects exposing ``fit`` and ``predict``.
    kwargs:
        Extra keyword arguments for ``fit``, one dict per model, in the same
        order as ``models``.
    region:
        Key selecting the region in ``epigenomes`` and ``labels``.
    cell_line:
        Directory prefix of the results file.

    Returns
    -------
    pandas.DataFrame
        One row per (model, run_type, holdout) with the ``holdout`` column
        removed; empty when nothing was trained or cached.
    """
    features = epigenomes[region].values
    targets = labels[region]
    results_path = cell_line + "/results_" + region + ".json"

    splits = 10
    holdouts = StratifiedShuffleSplit(n_splits=splits,
                                      test_size=0.2,
                                      random_state=42)

    # Probe the cache with the very call that reads it: compress_json's
    # ``local_*`` helpers resolve paths relative to the calling file, while an
    # ``os.path.exists`` check would resolve against the working directory.
    try:
        results = compress_json.local_load(results_path)
    except FileNotFoundError:
        results = []

    # ``train_idx`` / ``test_idx`` (rather than ``train`` / ``test``) avoid
    # shadowing this function's own name.
    for i, (train_idx, test_idx) in tqdm(
            enumerate(holdouts.split(features, targets)),
            total=splits,
            desc="Computing holdouts",
            dynamic_ncols=True):
        for model, params in tqdm(zip(models, kwargs),
                                  total=len(models),
                                  desc="Training models",
                                  leave=False,
                                  dynamic_ncols=True):
            class_name = model.__class__.__name__
            # "Sequential" models are told apart by their own ``name``.
            model_name = model.name if class_name == "Sequential" else class_name
            if precomputed(results, model_name, i):
                continue
            model.fit(features[train_idx], targets[train_idx], **params)
            for run_type, idx in (("train", train_idx), ("test", test_idx)):
                results.append({
                    "model": model_name,
                    "run_type": run_type,
                    "holdout": i,
                    **report(targets[idx], model.predict(features[idx])),
                })
            # Checkpoint after every model so progress survives a crash.
            compress_json.local_dump(results, results_path)

    # errors="ignore": an empty result list yields a frame without columns.
    return pd.DataFrame(results).drop(columns=["holdout"], errors="ignore")
def training_sequence_models(models, holdouts, cell_line, task):
    """Fit each model on each holdout and record its last-epoch metrics.

    For every (holdout, model) pair not reported as done by ``__precomputed``
    the model is fitted with early stopping on ``val_loss``; the metrics of
    the final row of the fit history are split into a "train" record
    (keys without the ``val_`` prefix) and a "test" record (``val_`` keys with
    the prefix stripped). The accumulated list is dumped to
    ``json/<cell_line>/results_sequence_<task>.json`` after every model.

    Parameters
    ----------
    models:
        Models exposing ``name`` and a ``fit`` returning an object with a
        ``history`` attribute.
    holdouts:
        Sized iterable of ``(train, test)`` pairs whose items expose
        ``steps_per_epoch``.
    cell_line:
        Sub-directory of ``json/`` receiving the results file.
    task:
        Suffix used in the results file name.

    Returns
    -------
    list of dict
        The accumulated result records.
    """
    results = []
    logging.info("Number of holdouts: {}".format(len(holdouts)))
    for i, (train, test) in enumerate(holdouts):
        progress = tqdm(models,
                        total=len(models),
                        desc="Training models",
                        leave=False,
                        dynamic_ncols=True)
        for model in progress:
            if __precomputed(results, model.name, i):
                continue

            logging.info("Training model {} holdout {}".format(model.name, i))
            stopper = EarlyStopping(monitor="val_loss", mode="min", patience=10)
            fit_output = model.fit(train,
                                   steps_per_epoch=train.steps_per_epoch,
                                   validation_data=test,
                                   validation_steps=test.steps_per_epoch,
                                   epochs=600,
                                   shuffle=True,
                                   verbose=False,
                                   callbacks=[stopper])
            # Only the final row of the history is kept.
            last_epoch = pd.DataFrame(fit_output.history).iloc[-1].to_dict()

            train_scores = {
                metric: score
                for metric, score in last_epoch.items()
                if not metric.startswith("val_")
            }
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **train_scores
            })

            # Strip the 4-character "val_" prefix so both records share keys.
            test_scores = {
                metric[4:]: score
                for metric, score in last_epoch.items()
                if metric.startswith("val_")
            }
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **test_scores
            })

            logging.info(
                "Add results {} to Json --> results_sequence_{}.json".format(
                    model.name, task))
            compress_json.local_dump(
                results,
                "json/" + cell_line + "/results_sequence_" + task + ".json")
    return results
Пример #3
0
def test_compress_json():
    """Round-trip a random dict through every default extension.

    The first pass uses ``dump``/``load``, the second ``local_dump``/
    ``local_load``; after each pass the directory that was written is removed.
    The second directory lives under ``tests/`` -- presumably because the
    ``local_*`` helpers resolve paths relative to this test file (TODO confirm).
    """
    D = random_string_dict(10, 10)
    expected = sha256(D)
    paths = [
        f"random_dirs/test.json.{ext}"
        for ext in compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()
    ]

    for path in paths:
        compress_json.dump(D, path)
        reloaded = compress_json.load(path)
        assert expected == sha256(reloaded)

    shutil.rmtree("random_dirs")

    for path in paths:
        compress_json.local_dump(D, path)
        reloaded = compress_json.local_load(path)
        assert expected == sha256(reloaded)

    shutil.rmtree("tests/random_dirs")
Пример #4
0
def predict_sequences(sequences: pd.DataFrame, labels: pd.DataFrame, models: List[Model]) -> pd.DataFrame:
    """Fit each sequence model on each holdout and collect last-epoch metrics.

    Results are checkpointed to the file named by ``_get_filename("seq")``
    after every model, and pairs that ``_precomputed`` reports as done are
    skipped.

    Parameters
    ----------
    sequences:
        Data handed to the holdout splitter and to ``get_holdout``.
    labels:
        Labels handed to the holdout splitter and to ``get_holdout``.
    models:
        Wrappers whose ``get_model()`` returns a ``(model, fit_kwargs)`` pair.

    Returns
    -------
    pandas.DataFrame
        One row per (model, run_type, holdout) with the ``holdout`` column
        removed.
    """
    filename = _get_filename("seq")
    # Read the checkpoint with the same library that writes it below, so that
    # path resolution and any compression implied by the file extension match
    # on both sides (a plain ``open`` + ``json.load`` would not).
    try:
        results = compress_json.local_load(filename)
    except FileNotFoundError:
        results = []

    for i, (train_index, test_index) in tqdm(enumerate(_get_holdouts().split(sequences, labels)),
                                             total=get_default('splits'), desc="Computing holdouts",
                                             dynamic_ncols=True):
        train, test = get_holdout(train_index, test_index, sequences, labels)
        for model, params in tqdm([wrapper.get_model() for wrapper in models], total=len(models),
                                  desc="Training models", leave=False, dynamic_ncols=True):
            if _precomputed(results, model, i):
                continue

            history = model.fit(
                train,
                validation_data=test,
                steps_per_epoch=train.steps_per_epoch,
                validation_steps=test.steps_per_epoch,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min", patience=get_default('patience'))
                ],
                **params
            ).history
            # Only the final row of the history is kept.
            scores = pd.DataFrame(history).iloc[-1].to_dict()

            train_scores = {
                key: value
                for key, value in scores.items()
                if not key.startswith("val_")
            }
            # Strip the 4-character "val_" prefix so both records share keys.
            test_scores = {
                key[4:]: value
                for key, value in scores.items()
                if key.startswith("val_")
            }
            results.append(_get_result(model, 'train', i, **train_scores))
            results.append(_get_result(model, 'test', i, **test_scores))
            # Checkpoint after every model so progress survives a crash.
            compress_json.local_dump(results, filename)
    return pd.DataFrame(results).drop(columns=['holdout'])
def training_tabular_models(holdouts, models, kwargs, cell_line, task):
    """Fit each tabular model on each holdout and record train/test reports.

    Every (holdout, model) pair not reported as done by ``__precomputed`` is
    fitted and then scored with ``__report`` on both its train and test
    partitions. The accumulated list is dumped to
    ``json/<cell_line>/results_tabular_<task>.json`` after every model.

    Parameters
    ----------
    holdouts:
        Sized iterable of ``(train, test)`` pairs, each indexable by
        ``"epigenomes"`` and ``"labels"``.
    models:
        Objects exposing ``fit`` and ``predict``.
    kwargs:
        Extra ``fit`` keyword arguments, one dict per model, aligned with
        ``models``.
    cell_line:
        Sub-directory of ``json/`` receiving the results file.
    task:
        Suffix used in the results file name.

    Returns
    -------
    list of dict
        The accumulated result records.
    """
    results = []
    logging.info("Number of holdouts: {}".format(len(holdouts)))
    for i, (train, test) in enumerate(holdouts):
        paired = tqdm(zip(models, kwargs),
                      total=len(models),
                      desc="Training models",
                      leave=False,
                      dynamic_ncols=True)
        for model, params in paired:
            # "Sequential" models are told apart by their own ``name``.
            class_name = model.__class__.__name__
            if class_name == "Sequential":
                model_name = model.name
            else:
                model_name = class_name

            if __precomputed(results, model_name, i):
                continue
            logging.info("Training model {} holdout {}".format(model_name, i))

            model.fit(train["epigenomes"], train["labels"], **params)

            train_report = __report(train["labels"],
                                    model.predict(train["epigenomes"]))
            results.append({
                "model": model_name,
                "run_type": "train",
                "holdout": i,
                **train_report
            })

            test_report = __report(test["labels"],
                                   model.predict(test["epigenomes"]))
            results.append({
                "model": model_name,
                "run_type": "test",
                "holdout": i,
                **test_report
            })

            logging.info(
                "Add results {} to Json --> results_tabular_{}.json".format(
                    model_name, task))
            compress_json.local_dump(
                results,
                "json/" + cell_line + "/results_tabular_" + task + ".json")
    return results
Пример #6
0
def predict_epigenomics(data: np.ndarray, labels: pd.DataFrame, models: List[Model]) -> pd.DataFrame:
    """Fit each model on each holdout of the epigenomic data and score it.

    Results are checkpointed to the file named by ``_get_filename("epi")``
    after every model, and pairs that ``_precomputed`` reports as done are
    skipped.

    Parameters
    ----------
    data:
        Feature matrix, indexed with the index arrays yielded by the splitter.
    labels:
        Labels, indexed the same way as ``data``.
    models:
        Wrappers whose ``get_model()`` returns a ``(model, fit_kwargs)`` pair.

    Returns
    -------
    pandas.DataFrame
        One row per (model, run_type, holdout) with the ``holdout`` column
        removed.
    """
    filename = _get_filename("epi")
    # Read the checkpoint with the same library that writes it below, so that
    # path resolution and any compression implied by the file extension match
    # on both sides (a plain ``open`` + ``json.load`` would not).
    try:
        results = compress_json.local_load(filename)
    except FileNotFoundError:
        results = []

    for i, (train, test) in tqdm(enumerate(_get_holdouts().split(data, labels)), total=get_default('splits'),
                                 desc="Computing holdouts", dynamic_ncols=True):
        for model, params in tqdm([wrapper.get_model() for wrapper in models], total=len(models),
                                  desc="Training models", leave=False, dynamic_ncols=True):
            if _precomputed(results, model, i):
                continue
            model.fit(data[train], labels[train], **params)
            for run_type, idx in (('train', train), ('test', test)):
                results.append(
                    _get_result(model, run_type, i, **_report(labels[idx], model.predict(data[idx]))))
            # Checkpoint after every model so progress survives a crash.
            # (A stray debug ``print("end")`` that garbled the progress bars
            # was removed here.)
            compress_json.local_dump(results, filename)

    return pd.DataFrame(results).drop(columns=['holdout'])
    def store_graph_data(self, data: Dict, graph_name: str) -> Dict:
        """Store the provided data for the given graph.

        Parameters
        -----------------------
        data: Dict,
            Data to store for the graph.
        graph_name: str,
            Name of graph to store data for.

        Returns
        -----------------------
        Whatever `compress_json.local_dump` returns
        (NOTE(review): presumably None, despite the `Dict` annotation -- confirm).
        """
        target_path = self.get_graph_data_path(graph_name)
        return compress_json.local_dump(data, target_path)
Пример #8
0
            buff["state"].append(prev_state.tolist())
            buff["action"].append(action)
            buff["reward"].append(reward)
            buff["done"].append(bool(done))
            buff["next_state"].append(state.tolist())

            buffer_size += 1
            step += 1
            print("\033[0;1;97m", end='')
            print("nb episode : {}     Buffersize : {} / {}   ".format(
                nb_episode, buffer_size, buff_max),
                  end='')
            tac = time.time()
            print("\033[3;91m", end='')
            print("{} secondes".format(int(tac - tic)), end='')
            print("\033[0;m                  \r", end='')

        nb_episode += 1

    print("\033[0;1;32m")
    print("sauvegarde du buffer...")
    if "--save" in sys.argv:
        arg_index = sys.argv.index("--save")
        save_name = sys.argv[arg_index + 1]
        compress_json.local_dump(buff, "preTrain/" + save_name + ".json.gz")
        print('sauvegardé en tant que "' + save_name + '"')
    else:
        compress_json.local_dump(buff, "preTrain/default.json.gz")
        print('sauvegardé en tant que "default"')
 def _dump_unsupported_graphs(self, unsupported_graphs: Set[str]):
     """Store the set of known unsupported graphs.

     Parameters
     -----------------------
     unsupported_graphs: Set[str],
         Names of the graphs to record; written to
         `self.unsupported_graphs_path`.
     """
     # JSON has no set type, so the collection is written as a list.
     compress_json.local_dump(
         list(unsupported_graphs), self.unsupported_graphs_path)
 def _dump_corrupted_graphs(self, corrupted_graphs: Set[str]):
     """Store the set of known corrupted graphs.

     Parameters
     -----------------------
     corrupted_graphs: Set[str],
         Names of the graphs to record; written to
         `self.corrupted_graphs_path`.
     """
     # JSON has no set type, so the collection is written as a list.
     compress_json.local_dump(
         list(corrupted_graphs), self.corrupted_graphs_path)
Пример #11
0
def train_sequence(epigenomes, labels, genome, cell_line, region, models):
    """Fit each sequence model on stratified holdouts of one region.

    The index levels of ``epigenomes[region]`` are turned into columns to
    form the table passed to the splitter and to ``get_holdout``. After every
    model the accumulated results are dumped to
    ``<cell_line>/sequence_<region>.json``; pairs that ``precomputed``
    reports as done are skipped.

    Parameters
    ----------
    epigenomes:
        Mapping from region name to an object with ``reset_index`` and
        ``index.names`` (presumably a pandas DataFrame -- TODO confirm).
    labels:
        Mapping from region name to that region's labels.
    genome:
        Passed through to ``get_holdout``.
    cell_line:
        Directory prefix of the results file.
    region:
        Key selecting the region in ``epigenomes`` and ``labels``.
    models:
        Models exposing ``name`` and a ``fit`` returning an object with a
        ``history`` attribute.

    Returns
    -------
    pandas.DataFrame
        One row per (model, run_type, holdout) with the ``holdout`` column
        removed.
    """
    region_frame = epigenomes[region]
    bed = region_frame.reset_index()[region_frame.index.names]
    region_labels = labels[region]

    splits = 2
    holdouts = StratifiedShuffleSplit(n_splits=splits,
                                      test_size=0.2,
                                      random_state=42)

    results_path = cell_line + "/sequence_" + region + ".json"
    results = (compress_json.local_load(results_path)
               if os.path.exists(results_path) else [])

    holdout_iter = tqdm(enumerate(holdouts.split(bed, region_labels)),
                        total=splits,
                        desc="Computing holdouts",
                        dynamic_ncols=True)
    for i, (train_index, test_index) in holdout_iter:
        train, test = get_holdout(train_index, test_index, bed, region_labels,
                                  genome)
        model_iter = tqdm(models,
                          total=len(models),
                          desc="Training models",
                          leave=False,
                          dynamic_ncols=True)
        for model in model_iter:
            if precomputed(results, model.name, i):
                continue
            stopper = EarlyStopping(monitor="val_loss", mode="min", patience=50)
            fit_output = model.fit(train,
                                   steps_per_epoch=train.steps_per_epoch,
                                   validation_data=test,
                                   validation_steps=test.steps_per_epoch,
                                   epochs=100,
                                   shuffle=True,
                                   verbose=False,
                                   callbacks=[stopper])
            # Only the final row of the history is kept.
            last_epoch = pd.DataFrame(fit_output.history).iloc[-1].to_dict()

            train_scores = {
                metric: score
                for metric, score in last_epoch.items()
                if not metric.startswith("val_")
            }
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **train_scores
            })

            # Strip the 4-character "val_" prefix so both records share keys.
            test_scores = {
                metric[4:]: score
                for metric, score in last_epoch.items()
                if metric.startswith("val_")
            }
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                **test_scores
            })
            compress_json.local_dump(results, results_path)

    return pd.DataFrame(results).drop(columns=["holdout"])
Пример #12
0
def train_model_seq(models, epigenomes, nlabels, region_type, cell_line):
    """Fit each sequence model on stratified holdouts with class weighting.

    After every model the accumulated results are dumped to
    ``<cell_line>_<region_type>_sequence.json``; pairs that ``precomputed``
    reports as done are skipped.

    Parameters
    ----------
    models:
        Models exposing ``name`` and a ``fit`` returning an object with a
        ``history`` attribute.
    epigenomes:
        Mapping from region type to the table converted by ``to_bed``.
    nlabels:
        Mapping from region type to an object exposing ``.values``.
    region_type:
        Key selecting the region in ``epigenomes`` and ``nlabels``.
    cell_line:
        Prefix of the results file name.

    Returns
    -------
    pandas.DataFrame
        One row per (model, run_type, holdout) with the ``holdout`` column
        removed; empty when nothing was trained or cached.
    """
    # Reproducibility. NOTE(review): PYTHONHASHSEED is read at interpreter
    # start-up, so setting it here only reaches child processes.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    splits = 11
    holdouts = StratifiedShuffleSplit(
        n_splits=splits, test_size=0.2, random_state=42)
    genome = Genome("hg19")
    bed = to_bed(epigenomes[region_type])
    labels = nlabels[region_type].values.ravel()
    results_path = cell_line + "_" + region_type + "_sequence.json"

    # Probe the cache with the very call that reads it: compress_json's
    # ``local_*`` helpers resolve paths relative to the calling file, while an
    # ``os.path.exists`` check would resolve against the working directory.
    try:
        results = compress_json.local_load(results_path)
    except FileNotFoundError:
        results = []

    # Keyword arguments: recent scikit-learn releases reject ``classes`` and
    # ``y`` when passed positionally; older releases accept the keywords too.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    for i, (train_index, test_index) in tqdm(enumerate(holdouts.split(bed, labels)), total=splits,
                                             desc="Computing holdouts", dynamic_ncols=True):
        train, test = get_holdout(
            train_index, test_index, bed, labels, genome, 1024)
        print("="*80)
        for model in tqdm(models, total=len(models), desc="Training models", leave=False, dynamic_ncols=True):
            if precomputed(results, model.name, i):
                continue
            history = model.fit(
                train,
                steps_per_epoch=train.steps_per_epoch,
                validation_data=test,
                validation_steps=test.steps_per_epoch,
                epochs=1000,
                shuffle=True,
                verbose=False,
                class_weight=class_w,
                callbacks=[
                    EarlyStopping(monitor="val_loss", mode="min",
                                  patience=50, restore_best_weights=True),
                ]
            ).history
            # NOTE(review): this keeps the final row of the history, while
            # restore_best_weights=True may leave the model at an earlier
            # epoch -- confirm the recorded metrics are the intended ones.
            scores = pd.DataFrame(history).iloc[-1].to_dict()
            results.append({
                "model": model.name,
                "run_type": "train",
                "holdout": i,
                **{
                    key: value
                    for key, value in scores.items()
                    if not key.startswith("val_")
                }
            })
            results.append({
                "model": model.name,
                "run_type": "test",
                "holdout": i,
                # Strip the 4-character "val_" prefix so both records share keys.
                **{
                    key[4:]: value
                    for key, value in scores.items()
                    if key.startswith("val_")
                }
            })
            compress_json.local_dump(results, results_path)

    # Built once, after the loops: building it inside the inner loop left the
    # name unbound whenever every pair was already cached (or ``models`` was
    # empty). errors="ignore" covers an empty, column-less frame.
    return pd.DataFrame(results).drop(columns="holdout", errors="ignore")
Пример #13
0
def train_model_epi(models, epigenomes, nlabels, region_type, cell_line):
    """Fit each model on stratified holdouts of the epigenomic features.

    After every model the accumulated results are dumped to
    ``<cell_line>_<region_type>_epigenomic.json``; pairs that ``precomputed``
    reports as done are skipped.

    Parameters
    ----------
    models:
        Models whose ``fit`` accepts Keras-style arguments (``epochs``,
        ``callbacks``, ``class_weight``, ...) and which expose ``predict``.
    epigenomes:
        Mapping from region type to the feature matrix.
        NOTE(review): the matrix is indexed as ``X[train]`` with the
        splitter's index arrays, so it presumably is a NumPy array rather
        than a DataFrame -- confirm against the caller.
    nlabels:
        Mapping from region type to an object exposing ``.values``.
    region_type:
        Key selecting the region in ``epigenomes`` and ``nlabels``.
    cell_line:
        Prefix of the results file name.

    Returns
    -------
    pandas.DataFrame
        One row per (model, run_type, holdout) with the ``holdout`` column
        removed; empty when nothing was trained or cached.
    """
    # Reproducibility. NOTE(review): PYTHONHASHSEED is read at interpreter
    # start-up, so setting it here only reaches child processes.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(42)

    y = nlabels[region_type].values.ravel()
    X = epigenomes[region_type]
    print("Num feature: " + str(X.shape[1]))
    splits = 51
    holdouts = StratifiedShuffleSplit(n_splits=splits,
                                      test_size=0.2,
                                      random_state=42)
    # Keyword arguments: recent scikit-learn releases reject ``classes`` and
    # ``y`` when passed positionally; older releases accept the keywords too.
    class_w = class_weight.compute_class_weight(
        'balanced', classes=np.unique(y), y=y)
    class_w = dict(enumerate(class_w))
    print("Class weights: " + str(class_w))

    results_path = cell_line + "_" + region_type + "_epigenomic.json"
    # Probe the cache with the very call that reads it: compress_json's
    # ``local_*`` helpers resolve paths relative to the calling file, while an
    # ``os.path.exists`` check would resolve against the working directory.
    try:
        results = compress_json.local_load(results_path)
    except FileNotFoundError:
        results = []

    for i, (train, test) in tqdm(enumerate(holdouts.split(X, y)),
                                 total=splits,
                                 desc="Computing holdouts",
                                 dynamic_ncols=True):
        for model in tqdm(models,
                          total=len(models),
                          desc="Training models",
                          leave=False,
                          dynamic_ncols=True):
            class_name = model.__class__.__name__
            # "Sequential" models are told apart by their own ``name``.
            model_name = model.name if class_name == "Sequential" else class_name
            if precomputed(results, model_name, i):
                continue

            model.fit(X[train],
                      y[train],
                      epochs=1000,
                      shuffle=True,
                      verbose=False,
                      validation_split=0.1,
                      batch_size=1024,
                      class_weight=class_w,
                      callbacks=[
                          EarlyStopping(monitor="val_loss",
                                        mode="min",
                                        patience=50,
                                        restore_best_weights=True),
                      ])
            for run_type, idx in (("train", train), ("test", test)):
                results.append({
                    "model": model_name,
                    "run_type": run_type,
                    "holdout": i,
                    **report(y[idx], model.predict(X[idx]))
                })
            # Checkpoint after every model so progress survives a crash.
            compress_json.local_dump(results, results_path)

    # Built once, after the loops: building it inside the inner loop left the
    # name unbound whenever every pair was already cached (or ``models`` was
    # empty). errors="ignore" covers an empty, column-less frame.
    return pd.DataFrame(results).drop(columns=["holdout"], errors="ignore")