Example #1
    def __init__(self, train_data_path, val_data_path=None, additional_data_paths=None, hyperparams=None,
                 preprocessing_function=None, log_dir="./", use_gpu=False, verbose=True, remove_extra_labels=True):
        """
        Train a supervised fasttext model
        :param train_data_path: str, path to train.txt file
        :param val_data_path: str, path to val.txt file. If val_data_path is None, the score won't be kept in
        history.json
        :param additional_data_paths: list of str, paths of fasttext format additional data to concat with train file
        :param hyperparams: dict, all hyperparams for train_supervised
        :param preprocessing_function: function, function to apply on text data before feeding into network
        :param log_dir: str, directory to save the training files and the model
        :param use_gpu: bool, use gpu for training
        :param verbose: bool
        :param remove_extra_labels: bool, remove datapoints with labels which appear in additional_data_paths but not in
        train_data_path. Ignored if additional_data_paths is None
        :return: object, the trained model
        """
        log_dir = validate(log_dir)
        self.hyperparams = \
            {"train_path": handle_space_paths("./train.txt"),
             "validation_path": handle_space_paths(""),
             "min_word_count": 1,
             "min_label_count": 1,
             "label_prefix": "__label__",
             "dim": 100,
             "n_epochs": 10,
             "word_ngrams": 1,
             "sort_ngrams": 0,
             "batch_size": 1024,
             "batch_size_inference": 1024,
             "batch_norm": 0,
             "seed": 17,
             "top_k": 5,
             "learning_rate": 0.3,
             "learning_rate_multiplier": 0.8,
             "dropout": 0.5,
             "l2_reg_weight": 1e-06,
             "data_fraction": 1,
             "save_models": 0,
             "use_validation": 0,
             "use_gpu": 0,
             "gpu_fraction": 0.5,
             "force": 0,
             "cache_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "cache"))),
             "result_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "results"))),
             "flush": 1}

        assert os.path.exists(train_data_path), "train_data_path is incorrect"
        if val_data_path:
            assert os.path.exists(val_data_path), "val_data_path is incorrect"
            self.hyperparams["use_validation"] = 1
            self.hyperparams["validation_path"] = val_data_path

        to_restore = {}
        if hyperparams:
            for k, v in hyperparams.items():
                if k not in self.hyperparams:
                    to_restore[k] = v
                    if k != "split_and_train_params":
                        print("WARNING! {} not in hyperparams, ignoring it".format(k))
                else:
                    if k in ["train_path", "validation_path", "cache_dir", "result_dir"]:
                        self.hyperparams[k] = handle_space_paths(v)
                    else:
                        self.hyperparams[k] = v

        train_data_path = os.path.abspath(train_data_path)
        if additional_data_paths:
            data_to_save = []
            paths_joined_hashed = hash_(" ".join(additional_data_paths))
            concat_path = "/tmp/tmp.txt"
            joined_path = "/tmp/{}.txt".format(paths_joined_hashed)
            _, all_labels = parse_txt(train_data_path)
            unique_labels = set(all_labels)
            assert isinstance(additional_data_paths, list), "additional_data_paths should be a list"
            for additional_data_path in additional_data_paths:
                assert os.path.exists(additional_data_path), "{} in additional_data_paths doesn't exist".format(
                    additional_data_path)
                current_data, current_labels = parse_txt(additional_data_path, join_desc=True)
                if remove_extra_labels:
                    needed_inds = [i for i, j in enumerate(current_labels) if j in unique_labels]
                    current_data = [current_data[i] for i in needed_inds]
                    current_labels = [current_labels[i] for i in needed_inds]
                data_to_save.extend(["{}{} {}".format(self.hyperparams["label_prefix"], label, data)
                                     for label, data in zip(current_labels, current_data)])
            with open(concat_path, "w+") as outfile:
                outfile.write("\n".join(data_to_save))
            os.system("cat {} {} > {}".format(concat_path, train_data_path, joined_path))
            self.hyperparams["train_path"] = joined_path
            to_restore["original_train_path"] = train_data_path
            to_restore["additional_data_paths"] = additional_data_paths
        else:
            self.hyperparams["train_path"] = train_data_path

        if use_gpu:
            self.hyperparams["use_gpu"] = 1

        command = self._get_command()
        process = Popen(command, stdout=PIPE, shell=True, stderr=STDOUT, bufsize=1, close_fds=True)

        log_dir_line = ""
        for line in iter(process.stdout.readline, b""):
            line = line.rstrip().decode("utf-8")
            if "stored at" in line:
                log_dir_line = line

            if "accuracy" in line:
                line_split = line.split()
                if "val" in line:
                    self.top_1_accuracy = float(line_split[-4][:-1])
                    self.top_k_accuracy = float(line_split[-1])
                else:
                    if "1" in line_split:
                        self.top_1_accuracy = float(line_split[-1])
                    if str(self.hyperparams["top_k"]) in line_split:
                        self.top_k_accuracy = float(line_split[-1])

            if verbose:
                print(line)
        process.stdout.close()

        log_dir_split = log_dir_line.split("at ")
        for k, v in to_restore.items():
            self.hyperparams[k] = v
        super(train_supervised, self). \
            __init__(model_path=os.path.join(log_dir_split[-1], "model_ep{}.pb".format(self.hyperparams["n_epochs"])),
                     model_params_path=os.path.join(log_dir_split[-1], "model_params.json"),
                     use_gpu=use_gpu, label_prefix=self.hyperparams["label_prefix"],
                     preprocessing_function=preprocessing_function,
                     hyperparams=self.hyperparams)
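
A hypothetical usage sketch of the class above (file paths, hyperparameter values and the preprocessing function are placeholders, not from the source):

def lowercase(text):
    return text.lower()

model = train_supervised(
    train_data_path="data/train.txt",     # placeholder path
    val_data_path="data/val.txt",         # placeholder path
    hyperparams={"dim": 100, "n_epochs": 10, "learning_rate": 0.3},
    preprocessing_function=lowercase,
    log_dir="./logs",
    use_gpu=False,
    verbose=True)
print(model.top_1_accuracy, model.top_k_accuracy)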
Example #2
def run_train(data, train_specific, train_params, data_specific, train_history,
              train_history_path):
    """
    Run training with the given data, parameters and hyperparameters
    :param data: dict, prepared train/test data (description hashes, labels, label vocabulary, cache)
    :param train_specific: dict, train hyper-parameters
    :param train_params: dict, train parameters
    :param data_specific: dict, data-specific parameters
    :param train_history: dict, train history
    :param train_history_path: str, path to train history
    :return: None, prints the training outputs
    """

    seed = train_specific["seed"]
    learning_rate = train_specific["learning_rate"]
    embedding_dim = train_specific["embedding_dim"]
    use_batch_norm = train_specific["use_batch_norm"]
    l2_reg_weight = train_specific["l2_reg_weight"]
    num_epochs = train_specific["num_epochs"]
    batch_size = train_specific["batch_size"]
    train_dropout_keep_rate = train_specific["dropout"]
    learning_rate_multiplier = train_specific["learning_rate_multiplier"]
    cache_dir = train_specific["cache_dir"]
    train_path = train_specific["train_path"]
    del train_specific["train_path"]

    train_description_hashes = data["train_description_hashes"]
    train_labels = data["train_labels"]
    test_description_hashes = data["test_description_hashes"]
    test_labels = data["test_labels"]
    label_vocab = data["label_vocab"]
    cache = data["cache"]
    num_words_in_train = data["num_words_in_train"]
    test_path = data["test_path"]
    initial_test_len = data["initial_test_len"]
    num_labels = len(label_vocab)

    use_gpu = train_params["use_gpu"]
    gpu_fraction = train_params["gpu_fraction"]
    use_tensorboard = train_params["use_tensorboard"]
    top_k = train_params["top_k"]
    save_all_models = train_params["save_all_models"]
    compare_top_k = train_params["compare_top_k"]
    use_test = train_params["use_test"]
    log_dir = train_params["log_dir"]
    batch_size_inference = train_params["batch_size_inference"]
    progress_bar = train_params["progress_bar"]
    flush = train_params["flush"]

    hyperparameter_hash = hash_("".join(
        [str(hyperparam) for hyperparam in train_specific.values()]))

    if use_gpu:
        device = "/gpu:0"
        config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=gpu_fraction,
                allow_growth=True))
    else:
        device = "/cpu:0"
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        config = tf.ConfigProto(allow_soft_placement=True)

    with tf.device(device):
        with tf.Session(config=config) as sess:
            input_placeholder = tf.placeholder(tf.int32,
                                               shape=[None, None],
                                               name="input")
            weights_placeholder = tf.placeholder(tf.float32,
                                                 shape=[None, None],
                                                 name="input_weights")
            labels_placeholder = tf.placeholder(tf.int32,
                                                shape=[None],
                                                name="label")
            learning_rate_placeholder = tf.placeholder_with_default(
                learning_rate, shape=[], name="learning_rate")
            dropout_drop_rate_placeholder = tf.placeholder_with_default(
                0., shape=[], name="dropout_rate")
            is_training = tf.placeholder_with_default(False,
                                                      shape=[],
                                                      name="do_dropout")

            tf.set_random_seed(seed)

            with tf.name_scope("embeddings"):
                token_embeddings = tf.Variable(tf.random.uniform(
                    [num_words_in_train, embedding_dim]),
                                               name="embedding_matrix")

            with tf.name_scope("mean_sentence_embedding"):
                gathered_embeddings = tf.gather(token_embeddings,
                                                input_placeholder)
                weights_broadcasted = tf.expand_dims(weights_placeholder,
                                                     axis=2)
                mean_embedding = tf.reduce_sum(tf.multiply(
                    weights_broadcasted, gathered_embeddings),
                                               axis=1,
                                               name="sentence_embedding")
            if use_batch_norm:
                mean_embedding = tf.layers.batch_normalization(
                    mean_embedding, training=is_training)
            mean_embedding_dropout = tf.layers.dropout(
                mean_embedding,
                rate=dropout_drop_rate_placeholder,
                training=is_training)
            logits = tf.layers.dense(
                mean_embedding_dropout,
                num_labels,
                use_bias=False,
                kernel_initializer=tf.truncated_normal_initializer(),
                name="logits")
            # not used during training; "prediction" is the output node used at inference
            output = tf.nn.softmax(logits, name="prediction")

            with tf.name_scope("Accuracy"):
                correctly_predicted = tf.nn.in_top_k(logits,
                                                     labels_placeholder,
                                                     1,
                                                     name="Top_1")
                correctly_predicted_top_k = tf.nn.in_top_k(logits,
                                                           labels_placeholder,
                                                           top_k,
                                                           name="Top_k")

            if use_tensorboard:
                train_writer = tf.summary.FileWriter(
                    os.path.join(log_dir, "Train"), sess.graph)
                train_end_writer = tf.summary.FileWriter(
                    os.path.join(log_dir, "End_epoch_train"))

            if use_test:
                batch_counter = 0
                if use_tensorboard:
                    test_writer = tf.summary.FileWriter(
                        os.path.join(log_dir, "Test"))
                    test_end_writer = tf.summary.FileWriter(
                        os.path.join(log_dir, "End_epoch_test"))

            ce_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_placeholder, logits=logits),
                name="CE_loss")
            l2_vars = tf.trainable_variables()
            l2_loss = tf.multiply(tf.add_n([tf.nn.l2_loss(v)
                                            for v in l2_vars]),
                                  l2_reg_weight,
                                  name="L2_loss")
            total_loss = tf.add(ce_loss, l2_loss, name="Total_loss")

            if use_tensorboard:
                tf.summary.scalar("Cross_entropy_loss", ce_loss)
                summary_op = tf.summary.merge_all()
            else:
                summary_op = tf.constant(0)

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

            with tf.control_dependencies(update_ops):
                train_op = tf.train.AdamOptimizer(
                    learning_rate_placeholder).minimize(total_loss)
            sess.run(tf.global_variables_initializer())

            iteration = 0
            train_start = time.time()
            best_score, best_scores = -1, {1: None, top_k: None}
            logs = {1: [], top_k: [], "best": -1}

            for epoch in range(1, num_epochs + 1):
                start_iteration = iteration
                print("\n\nEpoch {}".format(epoch), flush=flush)
                end_epoch_accuracy, end_epoch_accuracy_k = [], []
                end_epoch_loss, end_epoch_l2_loss, losses = [], [], []

                for batch, batch_weights, batch_labels in \
                        batch_generator(train_description_hashes, train_labels, batch_size, label_vocab, cache,
                                        shuffle=True, show_progress=progress_bar, progress_desc="Fit train"):

                    _, train_summary, _loss, correct, correct_k, batch_loss, batch_l2 = \
                        sess.run([train_op, summary_op, total_loss, correctly_predicted,
                                  correctly_predicted_top_k, ce_loss, l2_loss],
                                 feed_dict={input_placeholder: batch,
                                            weights_placeholder: batch_weights,
                                            labels_placeholder: batch_labels,
                                            learning_rate_placeholder: learning_rate,
                                            dropout_drop_rate_placeholder: 1 - train_dropout_keep_rate,
                                            is_training: True})
                    if use_tensorboard:
                        train_writer.add_summary(train_summary, iteration)

                    losses.append(_loss)
                    end_epoch_accuracy.extend(correct)
                    end_epoch_accuracy_k.extend(correct_k)
                    end_epoch_loss.append(batch_loss)
                    end_epoch_l2_loss.append(batch_l2)
                    iteration += 1

                end_iteration = iteration
                print("Current learning rate: {}".format(
                    round(learning_rate, 7)),
                      flush=flush)
                learning_rate *= learning_rate_multiplier
                mean_loss = percent_array(losses)
                if np.isnan(mean_loss):
                    print("Loss is NaN. Try using a smaller learning rate")
                    sys.exit(1)
                print("Moving mean loss: {}".format(mean_loss), flush=flush)

                mean_accuracy = percent_array(end_epoch_accuracy)
                mean_accuracy_k = percent_array(end_epoch_accuracy_k)

                if use_tensorboard:
                    write_summaries(end_epoch_loss, mean_accuracy,
                                    mean_accuracy_k, top_k, train_end_writer,
                                    epoch)
                    summary_loss_l2 = tf.Summary(value=[
                        tf.Summary.Value(
                            tag="L2", simple_value=np.mean(end_epoch_l2_loss))
                    ])
                    train_end_writer.add_summary(summary_loss_l2, epoch)
                print("Train moving accuracy: {}, top {}: {}".format(
                    mean_accuracy, top_k, mean_accuracy_k),
                      flush=flush)

                if use_test:
                    num_test_iterations = int(
                        np.ceil(len(test_labels) / batch_size_inference))
                    test_iterations = np.linspace(start_iteration,
                                                  end_iteration,
                                                  num_test_iterations)
                    end_epoch_accuracy, end_epoch_accuracy_k, end_epoch_loss = [], [], []

                    for index, (batch, batch_weights,
                                batch_labels) in enumerate(
                                    batch_generator(test_description_hashes,
                                                    test_labels,
                                                    batch_size_inference,
                                                    label_vocab,
                                                    cache,
                                                    show_progress=progress_bar,
                                                    progress_desc="Test")):
                        correct, correct_k, batch_loss, test_summary = sess.run(
                            [
                                correctly_predicted, correctly_predicted_top_k,
                                ce_loss, summary_op
                            ],
                            feed_dict={
                                input_placeholder: batch,
                                weights_placeholder: batch_weights,
                                labels_placeholder: batch_labels
                            })
                        if use_tensorboard:
                            test_writer.add_summary(
                                test_summary, int(test_iterations[index]))

                        end_epoch_accuracy.extend(correct)
                        end_epoch_accuracy_k.extend(correct_k)
                        end_epoch_loss.append(batch_loss)
                        batch_counter += 1

                    mean_accuracy = np.round(
                        100 * np.sum(end_epoch_accuracy) / initial_test_len, 2)
                    mean_accuracy_k = np.round(
                        100 * np.sum(end_epoch_accuracy_k) / initial_test_len,
                        2)
                    if use_tensorboard:
                        write_summaries(end_epoch_loss, mean_accuracy,
                                        mean_accuracy_k, top_k,
                                        test_end_writer, epoch)
                    print("Test accuracy: {}, top {}: {}".format(
                        mean_accuracy, top_k, mean_accuracy_k),
                          flush=flush)

                logs[1].append(mean_accuracy)
                logs[top_k].append(mean_accuracy_k)

                comparable = mean_accuracy
                if compare_top_k:
                    comparable = mean_accuracy_k

                if comparable > best_score:
                    best_score = comparable
                    best_scores[1] = mean_accuracy
                    best_scores[top_k] = mean_accuracy_k
                    freeze_save_graph(sess, log_dir, "model_best.pb",
                                      "prediction")
                    logs["best"] = epoch

                if save_all_models:
                    freeze_save_graph(sess, log_dir,
                                      "model_ep{}.pb".format(epoch),
                                      "prediction")
                else:
                    if epoch == num_epochs:
                        freeze_save_graph(sess, log_dir,
                                          "model_ep{}.pb".format(epoch),
                                          "prediction")
                iteration += 1

            print("Best model mean test accuracy: {}, top {}: {}".format(
                logs[1][logs["best"] - 1], top_k,
                logs[top_k][logs["best"] - 1]),
                  flush=flush)
            print("The model is stored at {}".format(log_dir), flush=flush)
            if use_test:
                results = {
                    "hyperparams": train_specific,
                    "scores": {
                        test_path: best_scores
                    }
                }
            else:
                results = {
                    "hyperparams": train_specific,
                    "scores": {
                        train_path: best_scores
                    }
                }
            train_history[hyperparameter_hash] = results

            with open(os.path.join(log_dir, "results.json"), "w+") as outfile:
                json.dump(results, outfile)
            with open(os.path.join(cache_dir, "details.json"),
                      "w+") as outfile:
                json.dump(data_specific, outfile)
            with open(train_history_path, "w+") as outfile:
                json.dump(train_history, outfile)
            with open(os.path.join(log_dir, "accuracy_logs.json"),
                      "w+") as outfile:
                json.dump(logs, outfile)

            print("The training took {} seconds".format(
                round(time.time() - train_start, 0)),
                  flush=flush)
    print("Peak memory usage: {}".format(
        round(tracemalloc.get_traced_memory()[1] / 1e6, 0)),
          flush=flush)
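
The heart of the graph built in run_train is a fastText-style sentence embedding: gather the token vectors, weight each by the uniform 1/len weights that cache_data produces, and sum over the sequence. A minimal NumPy sketch of that computation (toy sizes, illustrative only):

import numpy as np

np.random.seed(17)
token_embeddings = np.random.uniform(size=(10, 4))     # [vocab_size, embedding_dim]

batch = np.array([[1, 3, 5]])                          # token indices, [batch, seq]
batch_weights = np.full((1, 3), 1.0 / 3)               # uniform 1/len weights

gathered = token_embeddings[batch]                     # tf.gather -> [batch, seq, dim]
weights_broadcasted = batch_weights[:, :, None]        # tf.expand_dims(..., axis=2)
mean_embedding = (weights_broadcasted * gathered).sum(axis=1)   # [batch, dim]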
def cache_data(descriptions,
               labels,
               word_vocab,
               label_vocab,
               word_ngrams,
               sort_ngrams,
               cache=None,
               is_test_data=False,
               show_progress=True,
               progress_desc=None,
               print_postfix="\n",
               flush=False):
    """
    Cache data in order not to do repetitive work
    :param descriptions: list, tokenized descriptions of the input data (hashed internally for caching)
    :param labels: list
    :param word_vocab: dict, mapping of words and n-grams to their indices
    :param label_vocab: dict, mapping of labels to their indices
    :param word_ngrams: int
    :param sort_ngrams: bool
    :param cache: dict
    :param is_test_data: bool
    :param show_progress: bool, show progress bar
    :param progress_desc: str, description for progress bar
    :param print_postfix: str
    :param flush: bool, flush after printing
    :return: tuple, (description hashes, labels, cache)
    """
    if cache is None:
        cache = dict()

    description_hashes, kept_labels = [], []

    descriptions_thrown, labels_thrown = 0, 0
    disable_progressbar = not show_progress
    if disable_progressbar:
        if progress_desc:
            print(progress_desc, flush=flush)
    for description, label in \
            zip(tqdm(descriptions, disable=disable_progressbar, desc=progress_desc, file=sys.stdout), labels):

        phrase_indices = [0] + [
            word_vocab[phrase]["id"]
            for phrase in get_all(description, word_ngrams, sort_ngrams)
            if phrase in word_vocab
        ]
        if len(phrase_indices) == 1:
            descriptions_thrown += 1
            continue

        if label not in label_vocab:
            # only test data can contain labels unseen during training
            if is_test_data:
                labels_thrown += 1
                continue

        tmp_hash = hash_(str(description))
        if tmp_hash not in cache:
            desc_weights = [
                1. / len(phrase_indices) for _ in range(len(phrase_indices))
            ]
            cache[tmp_hash] = {"i": phrase_indices, "w": desc_weights}
        kept_labels.append(label)
        description_hashes.append(tmp_hash)

    if labels_thrown > 0:
        print("{} datapoints thrown because of empty description".format(
            descriptions_thrown),
              flush=flush)
        print("{} datapoints thrown because of label {}".format(
            labels_thrown, print_postfix),
              flush=flush)
    else:
        print("{} datapoints thrown because of empty description {}".format(
            descriptions_thrown, print_postfix),
              flush=flush)
    return description_hashes, kept_labels, cache
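
A toy call to cache_data (the vocabularies below are placeholders; with word_ngrams=1 the get_all helper is assumed to yield the tokens themselves):

word_vocab = {"red": {"id": 1}, "apple": {"id": 2}}    # placeholder vocabularies
label_vocab = {"fruit": 0}
hashes, kept_labels, cache = cache_data(
    descriptions=[["red", "apple"]],
    labels=["fruit"],
    word_vocab=word_vocab,
    label_vocab=label_vocab,
    word_ngrams=1,
    sort_ngrams=False,
    show_progress=False)
# cache maps each description hash to {"i": phrase indices, "w": uniform 1/len weights}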
Example #4
    def password(self, pass_):
        self._password = hash_(pass_)
Example #5
    def authenticate(self, pass_):
        return self.password == hash_(pass_)
Example #6
    def __init__(self, *args, **kwargs):
        for p in ['password', '_password']:
            if p in kwargs:
                self._password = hash_(kwargs[p])
                del kwargs[p]
        super(User, self).__init__(*args, **kwargs)
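
Examples #4-#6 together make sure only a hash of the password is ever stored. A hypothetical round trip (assumes User accepts keyword fields and hash_ is the project's deterministic hashing helper):

user = User(password="s3cret")         # __init__ stores hash_("s3cret") in _password
assert user.authenticate("s3cret")     # hashes are compared, never the plaintext
assert not user.authenticate("wrong")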
Example #7
    def __init__(self,
                 train_path,
                 test_path=None,
                 additional_data_paths=None,
                 hyperparams=None,
                 preprocessing_function=None,
                 log_dir="./",
                 use_gpu=False,
                 gpu_fraction=0.5,
                 verbose=True,
                 remove_extra_labels=True,
                 force=False):
        """
        Train a supervised fasttext model
        :param train_path: str, path to train file
        :param test_path: str or None, path to test file, if None training will be done without test
        :param additional_data_paths: list of str, paths of fasttext format additional data to concat with train file
        :param hyperparams: dict, all hyperparams for train_supervised
        :param preprocessing_function: function, function to apply on text data before feeding into network
        :param log_dir: str, directory to save the training files and the model
        :param use_gpu: bool, use gpu for training
        :param gpu_fraction: float, gpu fraction to allocate
        :param verbose: bool
        :param remove_extra_labels: bool, remove datapoints with labels which appear in additional_data_paths but not
            in train_path. Ignored if additional_data_paths is None
        :param force: bool, forced training
        :return: object, the trained model
        """
        log_dir = validate(log_dir)

        # default hyperparams
        self.hyperparams = \
            {"train_path": '',
             "test_path": '',
             "label_prefix": "__label__",
             "data_fraction": 1,
             "seed": 17,
             "embedding_dim": 100,
             "num_epochs": 10,
             "word_ngrams": 1,
             "sort_ngrams": 0,
             "batch_size": 4096,
             "use_batch_norm": 0,
             "min_word_count": 1,
             "learning_rate": 0.1,
             "learning_rate_multiplier": 0.8,
             "dropout": 0.5,
             "l2_reg_weight": 1e-06,
             "batch_size_inference": 4096,
             "top_k": 3,
             "compare_top_k": 0,
             "save_all_models": 0,
             "use_test": 0,
             "use_gpu": 0,
             "gpu_fraction": 0.5,
             "cache_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "cache"))),
             "log_dir": handle_space_paths(os.path.abspath(os.path.join(log_dir, "results"))),
             "force": 0,
             "progress_bar": 1,
             "flush": 1}

        if not os.path.exists(train_path):
            raise FileNotFoundError("train_path is incorrect")
        if test_path:
            if not os.path.exists(test_path):
                raise FileNotFoundError("test_path is incorrect")

        if preprocessing_function and verbose:
            print("Preprocessing train data ...")
        to_restore = dict()

        if hyperparams is None:
            hyperparams = dict()

        do_preprocessing = preprocessing_function is not None

        if len(hyperparams) != 0:
            for key, value in hyperparams.items():
                if key not in self.hyperparams:
                    to_restore[key] = value
                    print("WARNING! {} not in hyperparams, ignoring it".format(
                        key))
                else:
                    if key in ["cache_dir", "log_dir"]:
                        self.hyperparams[key] = handle_space_paths(value)
                    else:
                        self.hyperparams[key] = value
        train_path = os.path.abspath(train_path)
        if additional_data_paths:
            data_to_save = []
            paths_joined_hashed = hash_(" ".join(additional_data_paths))
            concat_path = "./tmp.txt"
            joined_path = "./{}.txt".format(paths_joined_hashed)
            _, all_labels = parse_txt(train_path)
            unique_labels = np.unique(all_labels)
            if not isinstance(additional_data_paths, list):
                raise ValueError(
                    "Type of additional_data_paths should be list")
            for additional_data_path in additional_data_paths:
                if not os.path.isfile(additional_data_path):
                    raise FileNotFoundError(
                        "{} in additional data paths doesn't exist".format(
                            additional_data_path))
                current_data, current_labels = parse_txt(additional_data_path)
                if remove_extra_labels:
                    needed_mask = np.in1d(current_labels, unique_labels)
                    current_data = [
                        data for data, needed in zip(current_data, needed_mask)
                        if needed
                    ]
                    current_labels = [
                        label
                        for label, needed in zip(current_labels, needed_mask)
                        if needed
                    ]
                if do_preprocessing:
                    data_to_save.extend([
                        "{}{} {}".format(self.hyperparams["label_prefix"],
                                         label, preprocessing_function(data))
                        for label, data in zip(current_labels, current_data)
                    ])
                else:
                    data_to_save.extend([
                        "{}{} {}".format(self.hyperparams["label_prefix"],
                                         label, data)
                        for label, data in zip(current_labels, current_data)
                    ])
            np.savetxt(concat_path, data_to_save, fmt="%s")
            if do_preprocessing:
                prep_train_path = preprocess_data(train_path,
                                                  preprocessing_function)
                os.system("cat {} {} > {}".format(concat_path, prep_train_path,
                                                  joined_path))
                to_restore["original_train_path"] = prep_train_path
            else:
                os.system("cat {} {} > {}".format(concat_path, train_path,
                                                  joined_path))
                to_restore["original_train_path"] = train_path
            self.hyperparams["train_path"] = joined_path
            to_restore["additional_data_paths"] = additional_data_paths
        else:
            if do_preprocessing:
                prep_train_path = preprocess_data(train_path,
                                                  preprocessing_function)
                self.hyperparams["train_path"] = prep_train_path
            else:
                self.hyperparams["train_path"] = train_path

        if preprocessing_function and verbose:
            print("Done!")

        if test_path is not None:
            test_path = os.path.abspath(test_path)
            self.hyperparams["use_test"] = 1
            if do_preprocessing:
                prep_test_path = preprocess_data(test_path,
                                                 preprocessing_function)
                to_restore["original_test_path"] = test_path
                self.hyperparams["test_path"] = prep_test_path
            else:
                self.hyperparams["test_path"] = test_path

        if use_gpu:
            self.hyperparams["use_gpu"] = 1
            self.hyperparams["gpu_fraction"] = gpu_fraction

        if force:
            self.hyperparams["force"] = 1

        # using Popen as calling the command from Jupyter doesn't deallocate GPU memory
        train_command = self._get_train_command()
        process = Popen(train_command,
                        stdout=PIPE,
                        shell=True,
                        stderr=STDOUT,
                        bufsize=1,
                        close_fds=True)
        self.top_1_accuracy, self.top_k_accuracy, log_dir = \
            get_accuracy_log_dir(process, self.hyperparams["top_k"], verbose)

        for key, value in to_restore.items():
            self.hyperparams[key] = value
        super(train_supervised,
              self).__init__(model_path=os.path.join(log_dir, "model_best.pb"),
                             model_params_path=os.path.join(
                                 log_dir, "model_params.json"),
                             use_gpu=use_gpu,
                             gpu_fraction=gpu_fraction,
                             hyperparams=self.hyperparams,
                             label_prefix=self.hyperparams["label_prefix"],
                             preprocessing_function=preprocessing_function)
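
A hypothetical call to the newer signature above (paths and hyperparameter values are placeholders):

model = train_supervised(
    train_path="data/train.txt",     # placeholder path
    test_path="data/test.txt",       # placeholder path
    hyperparams={"embedding_dim": 100, "num_epochs": 10, "top_k": 3},
    preprocessing_function=str.lower,
    log_dir="./logs",
    use_gpu=False)
print(model.top_1_accuracy, model.top_k_accuracy)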
Example #8
def main():
    main_start = time.time()
    tracemalloc.start()
    parser = argparse.ArgumentParser()

    # data specific parameters
    parser.add_argument("-trp",
                        "--train_path",
                        type=str,
                        required=True,
                        help="path to train file",
                        default="")
    parser.add_argument("-tp",
                        "--test_path",
                        type=str,
                        help="path to test file",
                        default="")
    parser.add_argument("-lp",
                        "--label_prefix",
                        type=str,
                        help="label prefix",
                        default="__label__")
    parser.add_argument("-df",
                        "--data_fraction",
                        type=float,
                        default=1,
                        help="data fraction")
    parser.add_argument("-seed", "--seed", type=int, default=17)

    # hyper-parameters
    parser.add_argument("-dim",
                        "--embedding_dim",
                        type=int,
                        default=100,
                        help="length of embedding vector")
    parser.add_argument("-nep",
                        "--num_epochs",
                        type=int,
                        default=5,
                        help="number of epochs")
    parser.add_argument("-wng",
                        "--word_ngrams",
                        type=int,
                        default=1,
                        help="word ngrams")
    parser.add_argument("-sng",
                        "--sort_ngrams",
                        type=int,
                        default=0,
                        help="sort n-grams alphabetically")
    parser.add_argument("-bs",
                        "--batch_size",
                        type=int,
                        default=4096,
                        help="batch size for train")
    parser.add_argument("-bn",
                        "--use_batch_norm",
                        type=int,
                        default=0,
                        help="use batch norm")
    parser.add_argument(
        "-mwc",
        "--min_word_count",
        type=int,
        default=1,
        help="discard words which appear less than this number")
    parser.add_argument("-lr",
                        "--learning_rate",
                        type=float,
                        default=0.3,
                        help="learning rate")
    parser.add_argument("-lrm",
                        "--learning_rate_multiplier",
                        type=float,
                        default=0.8,
                        help="learning rate multiplier")
    parser.add_argument("-dr",
                        "--dropout",
                        type=float,
                        default=0.5,
                        help="train dropout keep rate")
    parser.add_argument("-l2",
                        "--l2_reg_weight",
                        type=float,
                        default=1e-6,
                        help="regularization weight")

    # parameters
    parser.add_argument("-bsi",
                        "--batch_size_inference",
                        type=int,
                        default=4096,
                        help="batch size for test")
    parser.add_argument("-k",
                        "--top_k",
                        type=int,
                        default=3,
                        help="report results for top k predictions")
    parser.add_argument(
        "-ck",
        "--compare_top_k",
        type=int,
        default=0,
        help="compare top k accuracies for determining the best model")
    parser.add_argument("-sm",
                        "--save_all_models",
                        type=int,
                        default=0,
                        help="save model after each epoch")
    parser.add_argument("-ut",
                        "--use_test",
                        type=int,
                        default=1,
                        help="evaluate on test data")
    parser.add_argument("-gpu",
                        "--use_gpu",
                        type=int,
                        default=0,
                        help="use gpu for training")
    parser.add_argument("-gpu_fr",
                        "--gpu_fraction",
                        type=float,
                        default=0.5,
                        help="what fraction of gpu to allocate")
    parser.add_argument("-utb",
                        "--use_tensorboard",
                        type=int,
                        default=0,
                        help="use tensorboard")
    parser.add_argument("-cd",
                        "--cache_dir",
                        type=str,
                        help="cache directory",
                        default="./cache/")
    parser.add_argument("-ld",
                        "--log_dir",
                        type=str,
                        help="log directory",
                        default="./results/")
    parser.add_argument("-f",
                        "--force",
                        type=int,
                        default=0,
                        help="force retraining")
    parser.add_argument("-pb",
                        "--progress_bar",
                        type=int,
                        default=1,
                        help="show progress bar")
    parser.add_argument("-fl",
                        "--flush",
                        type=int,
                        default=0,
                        help="flush after print")

    args = parser.parse_args()
    for name in ["use_batch_norm", "save_all_models", "use_test",
                 "sort_ngrams", "use_gpu", "use_tensorboard", "force",
                 "flush", "compare_top_k", "progress_bar"]:
        if getattr(args, name) not in [0, 1]:
            raise ValueError("{} should be 0 or 1.".format(name))

    train_path = os.path.abspath(args.train_path)
    sort_ngrams = bool(args.sort_ngrams)
    progress_bar = bool(args.progress_bar)
    flush = bool(args.flush)

    use_test = False
    if args.test_path:
        args.test_path = os.path.abspath(args.test_path)
        if bool(args.use_test):
            use_test = True

    print("\n\nTraining with arguments:\n{}\n".format(args))

    cache_dir = validate(args.cache_dir)
    log_dir = validate(args.log_dir)
    train_history_path = os.path.join(log_dir, "history.json")

    np.random.seed(args.seed)

    train_descriptions, train_labels, max_words = \
        parse_txt(train_path, as_tokens=True, return_max_len=True,
                  fraction=args.data_fraction, seed=args.seed, label_prefix=args.label_prefix)

    data_specific = {
        "seed": args.seed,
        "data_fraction": args.data_fraction,
        "min_word_count": args.min_word_count,
        "word_ngrams": args.word_ngrams,
        "sort_ngrams": sort_ngrams,
    }

    data_hash = get_cache_hash(list_of_texts=train_descriptions,
                               data_specific_params=data_specific)
    cache_dir = os.path.abspath(validate(os.path.join(cache_dir, data_hash)))

    train_specific = {
        "embedding_dim": args.embedding_dim,
        "num_epochs": args.num_epochs,
        "batch_size": args.batch_size,
        "learning_rate": args.learning_rate,
        "learning_rate_multiplier": args.learning_rate_multiplier,
        "use_batch_norm": bool(args.use_batch_norm),
        "l2_reg_weight": args.l2_reg_weight,
        "dropout": args.dropout,
        "cache_dir": cache_dir
    }

    for k, v in data_specific.items():
        train_specific[k] = v

    model_params = {
        "word_ngrams": args.word_ngrams,
        "sort_ngrams": sort_ngrams,
        "word_dict_path": os.path.abspath(os.path.join(cache_dir, "word_dict.json")),
        "label_dict_path": os.path.abspath(os.path.join(cache_dir, "label_dict.json"))
    }

    hyperparams_hashed = hash_("".join(
        [str(i) for i in train_specific.values()]))
    current_log_dir = validate(os.path.join(log_dir, hyperparams_hashed))
    data_specific["train_path"], train_specific[
        "train_path"] = train_path, train_path

    train_params = {
        "use_gpu": bool(args.use_gpu),
        "gpu_fraction": args.gpu_fraction,
        "use_tensorboard": bool(args.use_tensorboard),
        "top_k": args.top_k,
        "save_all_models": bool(args.save_all_models),
        "compare_top_k": bool(args.compare_top_k),
        "use_test": use_test,
        "log_dir": current_log_dir,
        "batch_size_inference": args.batch_size_inference,
        "progress_bar": progress_bar,
        "flush": flush,
    }

    if os.path.exists(train_history_path):
        with open(train_history_path) as infile:
            train_history = json.load(infile)

        if hyperparams_hashed in train_history and check_model_presence(
                current_log_dir):
            if not bool(args.force):
                if args.test_path:
                    get_accuracy(current_log_dir, train_params,
                                 train_history_path, hyperparams_hashed,
                                 train_history, args.test_path,
                                 args.label_prefix)
                else:
                    get_accuracy(current_log_dir, train_params,
                                 train_history_path, hyperparams_hashed,
                                 train_history, train_path, args.label_prefix)

                print("The model is stored at {}".format(current_log_dir))
                exit()
            else:
                print("Forced retraining")
                print("Training hyper-parameters hashed: {}".format(
                    hyperparams_hashed))
        else:
            print("Training hyper-parameters hashed: {}".format(
                hyperparams_hashed))
    else:
        train_history = dict()

    clean_directory(current_log_dir)

    max_words_with_ng = get_max_words_with_ngrams(max_words, args.word_ngrams)

    print("Preparing dataset")
    print("Total number of datapoints: {}".format(len(train_descriptions)))
    print("Max number of words in description: {}".format(max_words))
    print("Max number of words with n-grams in description: {}".format(
        max_words_with_ng))

    word_vocab, label_vocab = get_word_label_vocabs(train_descriptions,
                                                    train_labels,
                                                    args.word_ngrams,
                                                    args.min_word_count,
                                                    sort_ngrams,
                                                    cache_dir,
                                                    bool(args.force),
                                                    show_progress=progress_bar,
                                                    flush=flush)

    with open(os.path.join(current_log_dir, "model_params.json"),
              "w+") as outfile:
        json.dump(model_params, outfile)

    num_words_in_train = len(word_vocab)
    train_description_hashes, train_labels, cache = \
        cache_data(train_descriptions, train_labels, word_vocab, label_vocab, args.word_ngrams, sort_ngrams,
                   show_progress=progress_bar, progress_desc="Cache train descriptions", flush=flush)
    del train_descriptions

    test_description_hashes, test_labels = [], []
    initial_test_len = 0
    if use_test:
        test_descriptions, test_labels, max_words_test = parse_txt(
            args.test_path,
            as_tokens=True,
            return_max_len=True,
            label_prefix=args.label_prefix)
        initial_test_len = len(test_descriptions)

        print("Total number of test datapoints: {}".format(
            len(test_descriptions)))
        test_description_hashes, test_labels, cache = \
            cache_data(test_descriptions, test_labels, word_vocab, label_vocab, args.word_ngrams, sort_ngrams,
                       cache=cache, is_test_data=True, show_progress=progress_bar,
                       progress_desc="Cache test descriptions", flush=flush)
        del test_descriptions

    data = {
        "train_description_hashes": train_description_hashes,
        "train_labels": train_labels,
        "test_description_hashes": test_description_hashes,
        "test_labels": test_labels,
        "cache": cache,
        "label_vocab": label_vocab,
        "num_words_in_train": num_words_in_train,
        "test_path": args.test_path,
        "initial_test_len": initial_test_len,
    }

    run_train(data, train_specific, train_params, data_specific, train_history,
              train_history_path)
    print("All process took {} seconds".format(
        round(time.time() - main_start, 0)),
          flush=flush)
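
A hypothetical invocation of this script (the file name train.py is a placeholder; the flags are the ones defined above):

python train.py --train_path data/train.txt --test_path data/test.txt \
    --num_epochs 10 --top_k 3 --use_gpu 0 --use_tensorboard 0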