Example 1
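Loads the train/dev/test splits, builds (or restores from pickle dumps) the word, POS and dependency-label vocabularies and their embedding matrices, then converts the data to id features. ModelConfig, DataConfig, DataReader, FeatureExtractor, Dataset, get_pickle and dump_pickle are assumed to come from the project's own modules; only os is from the standard library.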

import os


def load_datasets(load_existing_dump=False):
    model_config = ModelConfig()

    data_reader = DataReader()
    # Read the raw lines for each split; with-blocks close the file handles.
    with open(os.path.join(DataConfig.data_dir_path, DataConfig.train_path),
              "r") as f:
        train_lines = f.readlines()
    with open(os.path.join(DataConfig.data_dir_path, DataConfig.valid_path),
              "r") as f:
        valid_lines = f.readlines()
    with open(os.path.join(DataConfig.data_dir_path, DataConfig.test_path),
              "r") as f:
        test_lines = f.readlines()

    # Load data
    train_data = data_reader.read_data(train_lines)
    print("Loaded Train data")
    valid_data = data_reader.read_data(valid_lines)
    print("Loaded Dev data")
    test_data = data_reader.read_data(test_lines)
    print("Loaded Test data")

    feature_extractor = FeatureExtractor(model_config)
    dataset = Dataset(model_config, train_data, valid_data, test_data,
                      feature_extractor)

    # Vocab processing
    if load_existing_dump:
        dataset.word2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dataset.idx2word = {
            idx: word
            for (word, idx) in dataset.word2idx.items()
        }
        dataset.pos2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.pos_vocab_file))
        dataset.idx2pos = {idx: pos for (pos, idx) in dataset.pos2idx.items()}
        dataset.dep2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.dep_vocab_file))
        dataset.idx2dep = {idx: dep for (dep, idx) in dataset.dep2idx.items()}

        dataset.model_config.load_existing_vocab = True
        print "loaded existing Vocab!"
        dataset.word_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dataset.pos_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.pos_emb_file))
        dataset.dep_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.dep_emb_file))
        print "loaded existing embedding matrix!"

    else:
        dataset.build_vocab()
        dump_pickle(
            dataset.word2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dump_pickle(
            dataset.pos2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.pos_vocab_file))
        dump_pickle(
            dataset.dep2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.dep_vocab_file))
        dataset.model_config.load_existing_vocab = True
        print "Vocab Build Done!"
        dataset.build_embedding_matrix()
        print "embedding matrix Build Done"
        dump_pickle(
            dataset.word_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dump_pickle(dataset.pos_embedding_matrix,
                    os.path.join(DataConfig.dump_dir, DataConfig.pos_emb_file))
        dump_pickle(dataset.dep_embedding_matrix,
                    os.path.join(DataConfig.dump_dir, DataConfig.dep_emb_file))

    print "converting data into ids.."
    dataset.convert_data_to_ids()
    print "Done!"
    # Infer feature dimensionalities from the first converted training example.
    dataset.model_config.word_features_types = len(dataset.train_inputs[0][0])
    dataset.model_config.pos_features_types = len(dataset.train_inputs[1][0])
    dataset.model_config.dep_features_types = len(dataset.train_inputs[2][0])
    dataset.model_config.num_features_types = (
        dataset.model_config.word_features_types +
        dataset.model_config.pos_features_types +
        dataset.model_config.dep_features_types)
    dataset.model_config.num_classes = len(dataset.train_targets[0])

    return dataset
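
Both examples round-trip their vocabularies and embedding matrices through get_pickle/dump_pickle. Those helpers are not shown on this page; the following is a minimal sketch of what they presumably look like (the names are taken from the calls above, the bodies are assumed):

import os
import pickle

def dump_pickle(obj, path):
    # Assumed helper: serialize obj to path, creating the dump dir if needed.
    dump_dir = os.path.dirname(path)
    if dump_dir:
        os.makedirs(dump_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def get_pickle(path):
    # Assumed helper: load a previously dumped object back from disk.
    with open(path, "rb") as f:
        return pickle.load(f)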
Example 2
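A variant of the same loader, apparently for a sequence-labeling setup: it additionally records the maximum sequence and word lengths and builds word, character and label vocabularies (with word and character embedding matrices) instead of POS and dependency ones. The same project-level names are assumed as in Example 1.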

import os


def load_datasets(load_existing_dump=False):
    model_config = ModelConfig()

    data_reader = DataReader()
    # Read the raw lines for each split; with-blocks close the file handles.
    with open(os.path.join(DataConfig.data_dir_path, DataConfig.train_path),
              "r") as f:
        train_lines = f.readlines()
    with open(os.path.join(DataConfig.data_dir_path, DataConfig.valid_path),
              "r") as f:
        valid_lines = f.readlines()
    with open(os.path.join(DataConfig.data_dir_path, DataConfig.test_path),
              "r") as f:
        test_lines = f.readlines()

    # Load data
    train_data_obj = data_reader.read_data(train_lines)
    print("Loaded Train data")
    valid_data_obj = data_reader.read_data(valid_lines)
    print("Loaded Dev data")
    test_data_obj = data_reader.read_data(test_lines)
    print("Loaded Test data")

    feature_extractor = FeatureExtractor(model_config)
    dataset = Dataset(model_config, train_data_obj, valid_data_obj,
                      test_data_obj, feature_extractor)
    dataset.model_config.max_seq_len = dataset.get_max_seq_len()
    dataset.model_config.max_word_len = dataset.get_max_word_len()

    # Vocab processing
    if load_existing_dump:
        dataset.word2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dataset.idx2word = {
            idx: word
            for (word, idx) in dataset.word2idx.items()
        }

        dataset.char2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.char_vocab_file))
        dataset.idx2char = {
            idx: char
            for (char, idx) in dataset.char2idx.items()
        }

        dataset.label2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.label_vocab_file))
        dataset.idx2label = {
            idx: label
            for (label, idx) in dataset.label2idx.items()
        }

        dataset.model_config.load_existing_vocab = True
        print "loaded existing Vocab!"

        dataset.word_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dataset.char_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.char_emb_file))
        print "loaded existing embedding matrix!"

    else:
        dataset.build_vocab()
        dump_pickle(
            dataset.word2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dump_pickle(
            dataset.char2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.char_vocab_file))
        dump_pickle(
            dataset.label2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.label_vocab_file))

        dataset.model_config.load_existing_vocab = True
        print "Vocab Build Done!"
        dataset.build_embedding_matrix()
        print "embedding matrix Build Done"
        dump_pickle(
            dataset.word_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dump_pickle(
            dataset.char_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.char_emb_file))

    dataset.model_config.num_classes = len(dataset.label2idx)

    print "converting data into ids.."
    dataset.convert_data_to_ids()
    print "Done!"

    return dataset
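
A hypothetical call site, just to show the intended flow (the flag only controls whether the pickled vocab and embeddings are rebuilt or reloaded):

if __name__ == "__main__":
    # First run: build the vocab and embedding matrices from scratch and dump them.
    dataset = load_datasets(load_existing_dump=False)
    # Later runs can skip the expensive build and reload the dumps instead:
    # dataset = load_datasets(load_existing_dump=True)
    print("num_classes:", dataset.model_config.num_classes)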