Example #1
########### Inputs and labels ############
##########################################

print("Loading inputs and labels")

# Vector space model of GloVe embeddings trained on article full texts
vsm = pd.read_csv("../data/text/glove_gen_n100_win15_min5_iter500_190428.txt",
                  sep=" ",
                  index_col=0,
                  header=0)
n_vocab, n_emb = vsm.shape

# Document-term matrix generated from article full texts or titles
# (`inputs` is defined earlier in the full script; assumed here to be
# the two text sources used elsewhere in the project)
inputs = ["texts", "titles"]
dtm = {}
for inp in inputs:
    dtm_inp = utilities.load_doc_term_matrix(path="../", inputs=inp)
    dtm[inp] = dtm_inp[dtm_inp.columns.intersection(vsm.index)]
X = dtm

# Output labels are brain activation coordinates
Y = utilities.load_coordinates(path="../")
m, n_structs = Y.shape

# Splits of the article PMIDs
splits = {}
for split in ["train", "dev", "test"]:
    with open("../data/splits/{}.txt".format(split), "r") as f:
        splits[split] = [int(pmid.strip()) for pmid in f]
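
# --- Not part of the original script: a minimal sketch of one plausible next
# --- step, averaging the GloVe vectors of each article's terms (weighted by
# --- term counts) into a fixed-length article embedding. The helper name
# --- `embed_articles` is hypothetical.
import numpy as np

def embed_articles(dtm_inp, vsm):
    vectors = vsm.loc[dtm_inp.columns].values   # (n_terms, n_emb)
    counts = dtm_inp.values.astype(float)       # (n_articles, n_terms)
    weights = counts / np.maximum(counts.sum(axis=1, keepdims=True), 1.0)
    return weights @ vectors                    # (n_articles, n_emb)

doc_emb = {inp: embed_articles(dtm[inp], vsm) for inp in inputs}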
Example #2
import os
import pickle

from sklearn.model_selection import ParameterSampler

import utilities  # project-local helper module


def train_classifier(framework,
                     direction,
                     suffix="",
                     clf="",
                     dtm_version=190325):

    fit_file = "fits/{}_{}.p".format(framework, direction)
    if not os.path.isfile(fit_file):

        # Load the data splits
        splits = {}
        for split in ["train", "validation"]:
            with open("../../data/splits/{}.txt".format(split), "r") as f:
                splits[split] = [int(pmid.strip()) for pmid in f]

        # Load the activation coordinate and text data
        act_bin = utilities.load_coordinates(path="../../data")
        dtm_bin = utilities.load_doc_term_matrix(version=dtm_version,
                                                 binarize=True,
                                                 path="../../data")

        # Score the texts using the framework
        lists, circuits = utilities.load_framework(framework,
                                                   suffix=suffix,
                                                   clf=clf,
                                                   path="../../ontology")
        scores = utilities.score_lists(lists, dtm_bin)

        # Specify the hyperparameters for the randomized grid search
        param_grid = {
            "penalty": ["l1", "l2"],
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "fit_intercept": [True, False]
        }
        param_list = list(
            ParameterSampler(param_grid, n_iter=28, random_state=42))
        max_iter = 1000

        # Split the train set into batches and load the validation set as a batch
        if direction == "forward":
            train_set = [
                scores.loc[splits["train"]], act_bin.loc[splits["train"]]
            ]
            val_set = [
                scores.loc[splits["validation"]],
                act_bin.loc[splits["validation"]]
            ]

        elif direction == "reverse":
            train_set = [
                act_bin.loc[splits["train"]], scores.loc[splits["train"]]
            ]
            val_set = [
                act_bin.loc[splits["validation"]],
                scores.loc[splits["validation"]]
            ]

        # Search for the optimal hyperparameter combination
        op_fit = optimize_hyperparameters(param_list,
                                          train_set,
                                          val_set,
                                          max_iter=max_iter)

        # Export the optimized results
        pickle.dump(op_fit, open(fit_file, "wb"), protocol=2)
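
# --- A minimal sketch of optimize_hyperparameters, which this example calls
# --- but does not define. Assumptions: one logistic regression per output
# --- column, scored by macro ROC-AUC on the validation set; only the
# --- hyperparameter names (penalty, C, fit_intercept) come from param_grid.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

def optimize_hyperparameters(param_list, train_set, val_set, max_iter=1000):
    X_train, Y_train = train_set
    X_val, Y_val = val_set
    op_fit, op_score = None, -float("inf")
    for params in param_list:
        # liblinear supports both the l1 and l2 penalties in the grid
        base = LogisticRegression(max_iter=max_iter, solver="liblinear",
                                  **params)
        fit = MultiOutputClassifier(base).fit(X_train, Y_train)
        probs = np.column_stack([p[:, 1] for p in fit.predict_proba(X_val)])
        score = roc_auc_score(Y_val, probs, average="macro")
        if score > op_score:
            op_fit, op_score = fit, score
    return op_fit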
Example #3
import argparse

import numpy as np
import pandas as pd

import utilities  # project-local helper module

# Minimal parser for this excerpt; the full script may define more options
parser = argparse.ArgumentParser()
parser.add_argument("--data", type=str, default="texts")

args = parser.parse_args()
data = args.data

if data not in ["titles", "texts"]:
    raise ValueError("An invalid option for `--data` was supplied; "
                     "options are ['titles', 'texts']")

# Load the GloVe vector space model
vsm = pd.read_csv("../data/text/glove_gen_n100_win15_min5_iter500_190428.txt",
                  sep=" ",
                  index_col=0,
                  header=0)
n_vocab, n_emb = vsm.shape

# Load the term matrix
X = utilities.load_doc_term_matrix(path="../", inputs=data)
X = X[X.columns.intersection(vsm.index)]
m, n_terms = X.shape
lexicon = list(X.columns)
vsm = vsm.loc[lexicon]

# Load the data splits
splits = utilities.load_splits(splits=["train", "dev"], path="../", limit=5000)


# Zero out embeddings for terms that did not occur in articles
def load_emb(split):
    emb = np.zeros((n_terms, n_emb, len(splits[split])))
    occ = X.loc[splits[split]]
    for i, pmid in enumerate(splits[split]):
        terms = occ.columns[occ.values[i, :] == 0]
        emb_i = vsm.copy()           # inferred completion: copy GloVe vectors,
        emb_i.loc[terms] = 0.0       # zero rows of terms absent from article,
        emb[:, :, i] = emb_i.values  # and stack along the third axis
    return emb
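
# --- Usage sketch (not in the original excerpt): one tensor per split, with
# --- per-article GloVe matrices stacked along the third axis.
emb_train = load_emb("train")  # shape: (n_terms, n_emb, n_train_articles)
emb_dev = load_emb("dev")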
Example #4
import pandas as pd
import torch
from sklearn.model_selection import ParameterSampler

import utilities  # project-local helper module


def train_classifier(framework,
                     direction,
                     suffix="",
                     clf="",
                     dtm_version=190325,
                     opt_epochs=500,
                     train_epochs=1000,
                     use_hyperparams=False):

    # Load the data splits
    splits = {}
    for split in ["train", "validation"]:
        with open("../../data/splits/{}.txt".format(split), "r") as f:
            splits[split] = [int(pmid.strip()) for pmid in f]

    # Load the activation coordinate and text data
    act_bin = utilities.load_coordinates(path="../../data")
    dtm_bin = utilities.load_doc_term_matrix(version=dtm_version,
                                             binarize=True,
                                             path="../../data")

    # Score the texts using the framework
    lists, circuits = utilities.load_framework(framework,
                                               suffix=suffix,
                                               clf=clf,
                                               path="../../ontology")
    scores = utilities.score_lists(lists, dtm_bin)

    # If hyperparameters have already been optimized, use them
    param_file = "../data/params_{}_{}_{}epochs.csv".format(
        framework, direction, opt_epochs)
    if use_hyperparams:
        params = pd.read_csv(param_file, header=None, index_col=0)
        param_grid = {
            "lr": [float(params.loc["lr"])],
            "weight_decay": [float(params.loc["weight_decay"])],
            "n_hid": [int(params.loc["n_hid"])],
            "p_dropout": [float(params.loc["p_dropout"])]
        }
        param_list = list(
            ParameterSampler(param_grid, n_iter=1, random_state=42))
        n_epochs = train_epochs

    # Otherwise, specify hyperparameters for a randomized grid search
    else:
        param_grid = {
            "lr": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
            "weight_decay": [0.00001, 0.0001, 0.001, 0.01, 0.1],
            "n_hid": [25, 50, 75, 100, 125, 150],
            "p_dropout": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        }
        param_list = list(
            ParameterSampler(param_grid, n_iter=100, random_state=42))
        n_epochs = opt_epochs
    batch_size = 1024

    # Split the train set into batches and load the validation set as a batch
    if direction == "forward":
        train_set = load_mini_batches(scores,
                                      act_bin,
                                      splits["train"],
                                      mini_batch_size=batch_size,
                                      seed=42)
        val_set = load_mini_batches(scores,
                                    act_bin,
                                    splits["validation"],
                                    mini_batch_size=len(splits["validation"]),
                                    seed=42)

    elif direction == "reverse":
        train_set = load_mini_batches(act_bin,
                                      scores,
                                      splits["train"],
                                      mini_batch_size=batch_size,
                                      seed=42)
        val_set = load_mini_batches(act_bin,
                                    scores,
                                    splits["validation"],
                                    mini_batch_size=len(splits["validation"]),
                                    seed=42)

    # Search for the optimal hyperparameter combination
    op_state_dict, op_params, op_loss = optimize_hyperparameters(
        param_list, train_set, val_set, n_epochs=n_epochs)

    # Export the trained neural network
    fit_file = "../fits/{}_{}_{}epochs.pt".format(framework, direction,
                                                  n_epochs)
    torch.save(op_state_dict, fit_file)

    # Export the hyperparameters
    with open(param_file, "w+") as file:
        file.write("\n".join(
            ["{},{}".format(param, val) for param, val in op_params.items()]))

    # Export the loss over epochs
    loss_file = "../data/loss_{}_{}_{}epochs.csv".format(
        framework, direction, n_epochs)
    pd.DataFrame(op_loss, index=None, columns=["LOSS"]).to_csv(loss_file)
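
# --- A minimal sketch of load_mini_batches, which this example calls but does
# --- not define. Assumed behavior: shuffle the PMIDs with the given seed, then
# --- return (X, Y) tensor pairs of at most mini_batch_size rows each.
import numpy as np
import torch

def load_mini_batches(X, Y, pmids, mini_batch_size=64, seed=42):
    pmids = list(pmids)
    np.random.RandomState(seed).shuffle(pmids)
    batches = []
    for i in range(0, len(pmids), mini_batch_size):
        batch = pmids[i:i + mini_batch_size]
        batches.append((torch.FloatTensor(X.loc[batch].values),
                        torch.FloatTensor(Y.loc[batch].values)))
    return batches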