Example #1
def load_vocabulary():
    if os.path.exists(CKPT_PATH + config['TRAIN']['VOCABULARY']):
        word2index = {}
        with open(CKPT_PATH + config['TRAIN']['VOCABULARY']) as file:
            for line in file:
                # Each line is "<word> <index>".
                word, index = line.rstrip('\n').split()
                word2index[word] = int(index)
        index2word = dict(zip(word2index.values(), word2index.keys()))
        vocab = Vocabulary()
        vocab.word2index = word2index
        vocab.index2word = index2word
        return vocab
    else:
        raise FileNotFoundError('not found %s' %
                                (CKPT_PATH + config['TRAIN']['VOCABULARY']))
Example #2
def load_vocabulary():
    if os.path.exists(config.vocabulary_path):
        word2index = {}
        with open(config.vocabulary_path) as file:
            for line in file:
                # Each line is "<word> <index>".
                word, index = line.rstrip('\n').split()
                word2index[word] = int(index)
        index2word = dict(zip(word2index.values(), word2index.keys()))
        vocab = Vocabulary()
        vocab.word2index = word2index
        vocab.index2word = index2word
        return vocab
    else:
        raise FileNotFoundError('not found %s' % config.vocabulary_path)
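
The `Vocabulary` class itself never appears in these examples. For orientation, here is a minimal, hypothetical sketch consistent with the call sites above and in the later examples (attributes `word2index`/`index2word`, plus `vocab_size`, `build_vocab`, `encode`, `decode`, `save`, and `load_vocab`); every detail is inferred from usage, not taken from the original codebases:

class Vocabulary:
    """Hypothetical sketch of the token <-> id container these snippets assume."""

    def __init__(self):
        self.word2index = {}
        self.index2word = {}

    @property
    def vocab_size(self):
        return len(self.word2index)

    def build_vocab(self, text):
        # Assign ids in first-seen order; real implementations may sort by
        # frequency or operate on characters instead of whitespace tokens.
        for token in text.split():
            if token not in self.word2index:
                idx = len(self.word2index)
                self.word2index[token] = idx
                self.index2word[idx] = token

    def encode(self, text):
        # Unknown tokens are silently skipped in this sketch.
        return [self.word2index[t] for t in text.split() if t in self.word2index]

    def decode(self, ids):
        return ' '.join(self.index2word[i] for i in ids)

    def save(self, path):
        with open(path, 'w') as f:
            for word, idx in self.word2index.items():
                f.write('%s %d\n' % (word, idx))

    def load_vocab(self, path):
        with open(path) as f:
            for line in f:
                word, idx = line.split()
                self.word2index[word] = int(idx)
        self.index2word = {i: w for w, i in self.word2index.items()}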
Example #3
def main(_):

    vocabulary = Vocabulary()
    vocabulary.load_vocab(FLAGS.vocab_file)

    # If a directory is given, resolve it to its latest checkpoint file.
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = LSTMModel(vocabulary.vocab_size, sampling=True,
                      lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                      use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    # Encode the seed string, sample up to max_length ids, then decode.
    start = vocabulary.encode(FLAGS.start_string)
    arr = model.predict(FLAGS.max_length, start, vocabulary.vocab_size)
    print(vocabulary.decode(arr))
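
One caveat: `tf.train.latest_checkpoint` returns `None` when the directory contains no checkpoint, in which case `model.load` would fail with a confusing error. A defensive variant of the resolution step, as a sketch (the error message is illustrative):

if os.path.isdir(FLAGS.checkpoint_path):
    latest = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    if latest is None:
        raise FileNotFoundError('no checkpoint in %s' % FLAGS.checkpoint_path)
    FLAGS.checkpoint_path = latest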
Example #4
def main(config, local):
    n_gpu = int(GPU_NUM) or 1  # treat 0 GPUs as a single (CPU) process
    np.random.seed(config.random_seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Load the shared vocabulary used to build every dataset
    vocab = Vocabulary(config.vocab_path)

    if config.mode == 'train':
        # Prepare train data loader
        train_dataset, val_dataset = Dataset(vocab), Dataset(vocab)
        train_path = os.path.join(config.data_dir, 'train_data/train_data')
        val_path = os.path.join(config.data_dir, 'train_data/val_data')

        train_dataset.create_instances(train_path,
                                       config.max_seq_length,
                                       type='train')
        val_dataset.create_instances(val_path,
                                     config.max_seq_length,
                                     type='val')

        train_loader = DataLoader(train_dataset,
                                  batch_size=config.batch_size * n_gpu,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=config.batch_size * n_gpu)
    else:
        train_loader, val_loader = None, None

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)

    if nsml.IS_ON_NSML:
        # Register save/load hooks with the NSML platform and, if requested,
        # hand control over until the session is resumed.
        bind_model(trainer.model, vocab, config)

        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        trainer.train()
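
The `batch_size * n_gpu` scaling above suggests the `Trainer` wraps its model in `torch.nn.DataParallel`, which splits each incoming batch along dimension 0 across the available GPUs. A minimal sketch of that pattern (the `Linear` module is only a stand-in, not the real model):

import torch

model = torch.nn.Linear(10, 2)  # stand-in for the real model
if torch.cuda.is_available():
    if torch.cuda.device_count() > 1:
        # Each batch is split across GPUs, so the DataLoader's batch size
        # is scaled by the GPU count to keep the per-GPU batch size fixed.
        model = torch.nn.DataParallel(model)
    model = model.cuda()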
Example #5
def main(_):
    os.makedirs(checkpoint_path, exist_ok=True)

    # Read the training text.
    with open(datafile, 'r', encoding='utf-8') as f:
        train_data = f.read()

    # Load an existing vocabulary, or build one from the training text.
    vocabulary = Vocabulary()
    if FLAGS.vocab_file and os.path.exists(FLAGS.vocab_file):
        vocabulary.load_vocab(FLAGS.vocab_file)
    else:
        vocabulary.build_vocab(train_data)
        if FLAGS.vocab_file:
            vocabulary.save(FLAGS.vocab_file)

    # Encode the whole corpus and cut it into (input, target) batches.
    input_ids = vocabulary.encode(train_data)

    g = batch_generator(input_ids, FLAGS.batch_size, FLAGS.num_steps)

    model = LSTMModel(vocabulary.vocab_size,
                      batch_size=FLAGS.batch_size,
                      num_steps=FLAGS.num_steps,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      learning_rate=FLAGS.learning_rate,
                      train_keep_prob=FLAGS.train_keep_prob,
                      use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        checkpoint_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
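
`batch_generator` is not shown. Below is a hypothetical sketch of what it presumably yields: (input, target) arrays of shape [batch_size, num_steps] with targets shifted one step ahead, in the style common to char-RNN training scripts. The wrap-around of the final target column is an approximation at window boundaries:

import numpy as np

def batch_generator(ids, batch_size, num_steps):
    # Trim the id stream so it reshapes evenly into batch_size rows.
    ids = np.asarray(ids)
    n_batches = len(ids) // (batch_size * num_steps)
    ids = ids[:n_batches * batch_size * num_steps].reshape((batch_size, -1))
    while True:  # model.train() stops itself after FLAGS.max_steps
        for start in range(0, ids.shape[1], num_steps):
            x = ids[:, start:start + num_steps]
            y = np.zeros_like(x)
            # Targets are inputs shifted left; the last column wraps around.
            y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
            yield x, y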
Example #6
def load_data(config, vocab=None):
    test_df = pd.read_csv(config.test_file,
                          header=0,
                          names=['face_id', 'content', 'label'])

    test_data, test_label, test_num_sent, test_num_word = build_data(
        test_df['content'], test_df['label'])

    if vocab is None:
        # Build the vocabulary from the test sentences and their labels.
        vocab = Vocabulary()
        for doc in test_data:
            for sent, label in zip(doc, test_label):
                vocab.add_sentence(sent, label)

    test_input = [[[vocab.word_to_id(word) for word in sent] for sent in doc]
                  for doc in test_data]
    test_label = [vocab.tag_to_id(label) for label in test_label]
    test_input = pad_sequence(test_input, True, config.max_sent,
                              config.max_word)
    test_dataset = myDataset(test_input, test_label)

    return test_dataset, vocab
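
`pad_sequence` here takes a nested document -> sentence -> word-id structure. A hypothetical sketch of what the call `pad_sequence(test_input, True, config.max_sent, config.max_word)` might do; the `truncate` flag name, the default sizes, and the `pad_id` of 0 are all assumptions:

def pad_sequence(docs, truncate=True, max_sent=30, max_word=50, pad_id=0):
    # Clip every document to max_sent sentences and every sentence to
    # max_word ids, padding short ones with pad_id.
    padded_docs = []
    for doc in docs:
        doc = doc[:max_sent] if truncate else doc
        sents = [sent[:max_word] + [pad_id] * (max_word - len(sent[:max_word]))
                 for sent in doc]
        sents += [[pad_id] * max_word] * (max_sent - len(sents))
        padded_docs.append(sents)
    return padded_docs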
Example #7
subset = subset_df(df, n_samples=n_subset)

# Create train, val, test sets
train, validation, test = split_df(subset,
                                   size_train=train_size,
                                   size_valtest=valtest_size)

# Compute main target class weights
target_weights = class_weights(train, target='overall', p_expect=(1 / 3))
np.savetxt("train_class_weights.csv", target_weights, delimiter=",")

# Compute conditional independent sample weights
train = sample_weights(train)

# Create Vocab on train set
vocab = Vocabulary(freq_threshold=5)
wordidx, idxword = vocab.build_vocab(train['reviewText'].tolist())

# Save train, val, test sets
train.to_csv(save_train, index=False)
validation.to_csv(save_val, index=False)
test.to_csv(save_test, index=False)

# Save wordidx and idxword
with open(save_wordidx, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in wordidx.items():
        writer.writerow([key, value])

with open(save_idxword, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in idxword.items():
        writer.writerow([key, value])
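
For completeness, a sketch of the reverse step, loading the two-column CSVs written above back into dicts; the `load_mapping` helper is hypothetical:

import csv

def load_mapping(path, key_fn=str, value_fn=int):
    # Read back a two-column "key,value" CSV into a dict.
    mapping = {}
    with open(path, newline='') as csv_file:
        for row in csv.reader(csv_file):
            if row:  # skip any blank lines
                mapping[key_fn(row[0])] = value_fn(row[1])
    return mapping

# wordidx = load_mapping(save_wordidx)            # word -> index
# idxword = load_mapping(save_idxword, int, str)  # index -> word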