def __init__(self, source_vocab_file, target_vocab_file, sample_file,
             config, logger):
    self.logger = logger
    self.config = config
    self.sample_file = sample_file
    self.word2id_src, self.id2word_src = load_vocabulary(source_vocab_file)
    self.word2id_tar, self.id2word_tar = load_vocabulary(target_vocab_file)
    self.tf_record_file = os.path.join(self.config.tokenized_data_dir,
                                       'sample.tf_record')
    self.pad_id_src = self.word2id_src['<pad>']
    self.unk_id_src = self.word2id_src['<unk>']
    self.pad_id_tar = self.word2id_tar['<pad>']
    self.unk_id_tar = self.word2id_tar['<unk>']
Example #2
    def __load_embeddings(self):
        '''Loads the embeddings with the associated vocabulary
           and saves them for later usage in the DataLoader and
           while training/testing.'''
        vocabulary = None
        embeddings = None

        vocabulary = utils.load_vocabulary(self.cfg.get('vocabulary'))

        if self.cfg.get('w2v_embeddings'):
            embeddings = utils.load_w2v_embeddings(
                self.cfg.get('w2v_embeddings'))
        elif self.cfg.get('ft_embeddings'):
            embeddings = utils.load_ft_embeddings(
                self.cfg.get('ft_embeddings'))
        else:
            embeddings = np.random.uniform(
                -1.0,
                1.0,
                size=(len(vocabulary),
                      self.cfg.get('max_random_embeddings_size')))

        # Prepare the vocabulary and embeddings (e.g. add embedding for unknown words)
        embeddings, vocabulary = utils.prepare_embeddings_and_vocabulary(
            embeddings, vocabulary)

        self.cfg.set('vocabulary_dict', vocabulary)
        self.cfg.set('embeddings_matrix', embeddings)

        # revert the vocabulary for the idx -> text usages
        self.rev_vocabulary = utils.reverse_vocabulary(vocabulary)
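
The preparation step above is only referenced, not shown. A hypothetical sketch of what utils.prepare_embeddings_and_vocabulary might do for the unknown-word case mentioned in the comment, assuming the vocabulary is a word-to-index dict, is:

import numpy as np

def prepare_embeddings_and_vocabulary(embeddings, vocabulary):
    # Hypothetical sketch only: append a randomly initialised vector for
    # unknown words and register the matching '<unk>' entry in the
    # (assumed word -> index) vocabulary. The real utils helper may differ.
    if '<unk>' not in vocabulary:
        unk_vector = np.random.uniform(-1.0, 1.0, size=(1, embeddings.shape[1]))
        embeddings = np.vstack([embeddings, unk_vector])
        vocabulary['<unk>'] = embeddings.shape[0] - 1
    return embeddings, vocabulary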
Example #3
def __init__(self, vocab_file, sample_file, config, logger):
    self.logger = logger
    self.config = config
    self.sample_file = sample_file
    self.word2id, self.id2word = load_vocabulary(vocab_file)
    self.tf_record_file = os.path.join(self.config.tokenized_data_dir,
                                       'sample.tf_record')
    self.pad_id = self.word2id['<pad>']
    self.unk_id = self.word2id['<unk>']
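
The load_vocabulary helper these constructors rely on is not shown on this page. A minimal sketch consistent with the (word2id, id2word) return shape and the '<pad>'/'<unk>' lookups above, assuming a plain one-token-per-line vocabulary file, could look like this:

def load_vocabulary(vocab_file):
    # Minimal sketch (assumed file format: one token per line, line number
    # used as the token id); the projects above may implement this differently.
    word2id, id2word = {}, {}
    with open(vocab_file, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            token = line.rstrip('\n')
            word2id[token] = idx
            id2word[idx] = token
    return word2id, id2word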
Example #4
def __init__(self, parameters):
    """
    Constructor. It loads the hyperparameters of the network.

    Parameters
    ----------
    parameters: str
        File containing the parameters of the network
    """
    self.parameters = self.load_parameters(parameters)
    self.session_conf = tf.ConfigProto(
        allow_soft_placement=self.parameters['allow_soft_placement'],
        log_device_placement=self.parameters['log_device_placement'])
    self.vocabulary_dict = load_vocabulary(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     self.parameters['vocabulary']))
    self.inverse_vocabulary_dict = load_vocabulary(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     self.parameters['inverse_vocabulary']))
Example #5
File: test.py Project: yh157612/ADL2016
def main(_):
    vocab, dictionary = load_vocabulary(os.path.join(FLAGS.vocab_dir, 'sentence_vocab'))
    tags_list, tags_dict = load_vocabulary(os.path.join(FLAGS.vocab_dir, 'tag_vocab'))
    intent_list, intent_dict = load_vocabulary(os.path.join(FLAGS.vocab_dir, 'intent_vocab'))
    all_sentence = prepare_test_data(FLAGS.test_data_file, dictionary)

    model = RNNModel(hidden_size=FLAGS.hidden_size,
                     embed_size=FLAGS.embedding_size,
                     source_vocab_size=len(vocab),
                     tag_vocab_size=len(tags_list),
                     intent_vocab_size=len(intent_list))

    all_tags = []
    all_intent = []
    with tf.Session(graph=model.graph) as sess:
        # saver = tf.train.import_meta_graph('{}.meta'.format(FLAGS.checkpoint_file))
        model.saver.restore(sess, FLAGS.checkpoint_file)

        # graph = tf.get_default_graph()
        # input_x = graph.get_tensor_by_name('input_x:0')
        # input_len = graph.get_tensor_by_name('input_len:0')
        # keep_prob = graph.get_tensor_by_name('keep_prob:0')
        # output_tag = graph.get_tensor_by_name('output_tag:0')
        # output_intent = graph.get_tensor_by_name('output_intent:0')

        for sentence in all_sentence:
            predict_tags, predict_intent = sess.run([model.output_tag, model.output_intent], feed_dict={
                model.input_x: [sentence],
                model.input_len: [len(sentence)],
                model.keep_prob: 1.0
            })
            all_tags.append(predict_tags[0])
            all_intent.append(predict_intent[0])

    all_tags = [['O'] + [tags_list[i] for i in tags] for tags in all_tags]
    all_intent = [intent_list[i] for i in all_intent]
    with open(FLAGS.output_tag_file, 'w') as f:
        f.write('\n'.join([' '.join(tags) for tags in all_tags]))
    with open(FLAGS.output_intent_file, 'w') as f:
        f.write('\n'.join(all_intent))
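
prepare_test_data is not defined in this snippet. A hypothetical version consistent with the call above, where dictionary maps tokens to ids, might simply tokenize each line and replace out-of-vocabulary words with an unknown id:

def prepare_test_data(test_data_file, dictionary, unk_id=0):
    # Hypothetical helper (name and unk_id taken as assumptions): map each
    # whitespace-tokenized test sentence to a list of vocabulary ids,
    # falling back to unk_id for out-of-vocabulary tokens.
    sentences = []
    with open(test_data_file, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            sentences.append([dictionary.get(tok, unk_id) for tok in tokens])
    return sentences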
Example #6
def translate_to_origin(model, text, language):
    text_matrix = utils.text_to_one_hot_matrix(text, language)
    probas = model.predict(text_matrix)
    probas = probas[0]

    indices = []
    for proba in probas:
        ind = np.where(proba == np.amax(proba))
        indices.append(ind[0][0])

    char_to_index, index_to_char, vocab_size, trans_to_index, _, trans_vocab_size = utils.load_vocabulary(
        language)

    translated_text_list = [index_to_char[i] for i in indices]
    translated_text = ''.join(translated_text_list)
    return translated_text, translated_text_list
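
The per-step loop above just recovers the argmax index at each position. An equivalent vectorized form (a sketch, not part of the original utils module) is:

import numpy as np

def decode_argmax(probas, index_to_char):
    # Sketch: probas is assumed to have shape [time_steps, vocab_size];
    # take the argmax over the vocabulary axis and map indices back to characters.
    indices = np.argmax(probas, axis=-1)
    chars = [index_to_char[int(i)] for i in indices]
    return ''.join(chars), chars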
Example #7
class Hyperparamters:
    # Train parameters
    num_train_epochs = 20
    print_step = 1
    batch_size = 8
    summary_step = 10
    num_saved_per_epoch = 3
    max_to_keep = 100
    logdir = 'logdir/model_01'
    file_save_model = 'model/model_01'

    # Predict model file
    file_model = 'model/saved_01'

    # Train/Test data
    data_dir = os.path.join(pwd, 'data')
    train_data = 'train_onehot.csv'
    test_data = 'test_onehot.csv'

    # Load vocabulary dict
    dict_id2label, dict_label2id = load_vocabulary(
        os.path.join(pwd, 'data', 'vocabulary_label.txt'))
    label_vocabulary = list(dict_id2label.values())

    # Optimization parameters
    warmup_proportion = 0.1
    use_tpu = None
    do_lower_case = True
    learning_rate = 5e-5

    # TextCNN parameters
    # num_filters = 128
    # filter_sizes = [2,3,4,5,6,7]
    # embedding_size = 384
    # keep_prob = 0.5

    # Sequence and Label
    sequence_length = 60
    num_labels = len(list(dict_id2label))

    # ALBERT
    model = 'albert_small_zh_google'
    bert_path = os.path.join(pwd, model)
    vocab_file = os.path.join(pwd, model, 'vocab_chinese.txt')
    init_checkpoint = os.path.join(pwd, model, 'albert_model.ckpt')
    saved_model_path = os.path.join(pwd, 'model')
Example #8
def main():
    args = cmdparser()
    config = get_config(args.config)
    if args.preprocess:
        utils.preprocess(config['raw_path'], config['train_path'],
                         config['dev_path'], config['label_path'],
                         config['stop_word_path'], config['vocabulary_path'])
    labels = utils.load_labels(config['label_path'])
    vocabulary = utils.load_vocabulary(config['vocabulary_path'])
    stop_words = utils.load_stop_words(config['stop_word_path'])

    if args.dev:
        train(config, vocabulary, labels, stop_words, save_path='', mode='dev')
    elif args.train:
        if int(config['ensemble_size']) == 1:
            train(config,
                  vocabulary,
                  labels,
                  stop_words,
                  save_path=config['model_path'],
                  mode='train')
        else:
            for i in range(int(config['ensemble_size'])):
                train(config,
                      vocabulary,
                      labels,
                      stop_words,
                      save_path=config[f'model_path_{i+1}'],
                      mode='train')
    elif args.test:
        if int(config['ensemble_size']) == 1:
            test(config,
                 vocabulary,
                 labels,
                 stop_words,
                 save_path=[config['model_path']])
        else:
            test_paths = [
                config[f'model_path_{i+1}']
                for i in range(int(config['ensemble_size']))
            ]
            test(config, vocabulary, labels, stop_words, save_path=test_paths)
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--seq_len', default=40, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--translit_path', default=None)
    parser.add_argument('--language', default=None)

    args = parser.parse_args()

    print("Loading Files")
    (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans,
     trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (test_text, trans, long_letter_reverse_mapping) = utils.load_language_data(
        language=args.language, is_train=False)
    print("Building network ...")
    (output_layer,
     predict) = utils.define_model(args.hdim,
                                   args.depth,
                                   trans_vocab_size=trans_vocab_size,
                                   vocab_size=vocab_size,
                                   is_train=False)

    if args.model:
        f = np.load(args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)
    print("Testing ...")

    if args.translit_path:
        data = codecs.open(args.translit_path, 'r', encoding='utf-8').read()
        translate_romanized(predict, data, args.seq_len, trans,
                            trans_vocab_size, trans_to_index, index_to_char,
                            long_letter_reverse_mapping)

    else:
        test(predict, test_text, args.language, args.model, args.seq_len,
             long_letter_reverse_mapping, trans, trans_to_index, char_to_index,
             index_to_trans, index_to_char)
Example #10
from tqdm import tqdm
from random_word import RandomWords

from utils import load_vocabulary, guess_word

vocabularies = {
    "small": load_vocabulary("american-english-small"),
    "normal": load_vocabulary("american-english"),
    "large": load_vocabulary("american-english-large"),
    "insane": load_vocabulary("american-english-insane"),
    "english_normal": load_vocabulary("english_words.txt"),
    "english_large": load_vocabulary("english_words_complete.txt"),
}

for key1, voc1 in vocabularies.items():
    for key2, voc2 in vocabularies.items():
        if key2 == key1: continue
        diff1 = set(voc1).difference(set(voc2))
        diff2 = set(voc2).difference(set(voc1))
        sum_ = set(voc1).union(set(voc2))

        print()
        print(key1, " >>> ", len(voc1))
        print(key2, " >>> ", len(voc2))
        print(key1 + " - " + key2, " >>> ", len(diff1))
        print(key2 + " - " + key1, " >>> ", len(diff2))
        print(key1 + " + " + key2, " >>> ", len(sum_))

a = b  # NOTE: 'b' is undefined here; the original script stops with a NameError at this line

for key, vocab in vocabularies.items():
Example #11
                    help="entailment/neutral/contradiction")

# data
parser.add_argument("--word_emb_dim",
                    type=int,
                    default=300,
                    help="word embedding dimension")
"""
CONFIGURATIONS
"""
config = parser.parse_args()
"""
DATA
"""
train, dev, test = ed.import_datasets(config.nli_path)
word_vectors = ut.load_vocabulary(config.vocabulary_path)

for sentence_type in ['premise', 'hypothesis']:
    for data_type in ['train', 'dev']:
        eval(data_type)[sentence_type] = np.array(
            [['<s>'] + [word
                        for word in sent.split() if word in word_vectors] +
             ['</s>'] for sent in eval(data_type)[sentence_type]])
"""
MODEL
"""
# model configurations
config_nli_model = {
    'word_emb_dim': config.word_emb_dim,
    'bilstm_dim': config.bilstm_dim,
    'lstm_layers': config.lstm_layers,
Example #12
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='''
    Train your model specifying parameters
    
    Use-cases:
        python train.py --depth=10 --seq_len=30 --data_size=5_000 --languages=hy-en,hy-ru,ru-en
        python train.py --depth=10 --seq_len=30 --data_size=5_000 --languages=hy,ru-en,en
    
    ''')
    parser.add_argument('--hdim',
                        default=512,
                        type=int,
                        help='Dimension of hidden layers')
    parser.add_argument('--depth',
                        default=2,
                        type=int,
                        help='Depth of network.')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='Batch size for learning.')
    parser.add_argument('--seq_len',
                        default=100,
                        type=int,
                        help='Sequence length for splitting the text for training.')
    parser.add_argument('--languages',
                        default=None,
                        required=True,
                        help='Specify language to train.')
    # parser.add_argument('--grad_clip', default=100, type=int, help='')
    # parser.add_argument('--lr', default=0.01, type=float, help='')
    parser.add_argument('--epoch',
                        default=10,
                        type=int,
                        help='Epochs of train.')
    # parser.add_argument('--model', default=None, help='')
    parser.add_argument('--prefix',
                        default='m',
                        help='Used for model name prefix.')
    # parser.add_argument('--start_from', default=0, type=float, help='')
    parser.add_argument(
        '--model_path',
        type=str,
        help=
        'Specify model path to save; otherwise it will be saved under languages/<lang>models/model_name_prefix***'
    )
    parser.add_argument(
        '--validate',
        type=bool,
        default=True,
        help='Evaluate percentage of validation data. Default:True')
    parser.add_argument(
        '--data_size',
        type=int,
        default=5_000_000,
        help='Split data size in chars; set 0 to train on all data.')

    args = parser.parse_args()

    languages = utils.parse_languages(args.languages)

    print("Languages to train: " + str(languages))

    list_languages = []
    for key, value in languages.items():
        list_languages.append(key)
        if len(value) == 0:
            dirs = glob.glob('data_preprocessed/' + key + "/mapping_to_*")
            print(dirs)
            for dir in dirs:
                p = re.compile("mapping_to_(.*)")
                result = p.search(dir)
                list_languages.append(result.group(1))
        else:
            list_languages.extend(value)

    list_languages = list(dict.fromkeys(list_languages))
    list_languages.sort()

    print(list_languages)

    print("Loading Files")

    char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans, trans_vocab_size = \
                                                            utils.load_vocabulary(list_languages)

    print("vocab size: ", vocab_size)
    print("trans vocab size: ", trans_vocab_size)

    train_text, train_translated_text = utils.load_preprocessed_data(
        languages, args.data_size, 'train')

    print("Train text size:", len(train_text))
    print("Train translated text size:", len(train_translated_text))

    print(char_to_index)

    print('а' in char_to_index)
    print(ord('а'))

    x_train = utils.generator_biniries(train_text, args.seq_len, char_to_index)

    return  # NOTE: everything below this early return is unreachable in the original script

    # shuffle train data
    train_text = train_text.split('։')
    random.shuffle(train_text)
    train_text = '։'.join(train_text)

    if args.data_size != 0:
        val_size = round(args.data_size / 0.7 * 0.3)
        print("Data splitted, train:", args.data_size, ", val:", val_size)
        train_text = train_text[:args.data_size]  # 226_849_593
        val_text = val_text[:val_size]  #34722649

    import utilsk

    print("Building Network ...")

    model = utilsk.define_model(args.hdim,
                                args.depth,
                                trans_vocab_size,
                                vocab_size,
                                is_train=True)
    print(model.summary())

    print("Preparing data ...")
    before_fit_time = datetime.now()
    (x_train, y_train) = utils.data_generator(train_text,
                                              args.seq_len,
                                              trans,
                                              trans_to_index,
                                              char_to_index,
                                              is_train=True)

    print("Training ...")
    history = model.fit(x_train,
                        y_train,
                        validation_split=0.1,
                        epochs=args.epoch,
                        batch_size=args.batch_size)
    loss = history.history["loss"][-1]
    print(history.history)

    # save model
    model_file_path = utils.get_model_file_path(args, before_fit_time, loss)
    model.save_weights(model_file_path)
    print('Model saved:', model_file_path)

    print("Validate exact....")
    if args.validate:
        (x_test, y_test) = utils.data_generator(val_text,
                                                args.seq_len,
                                                trans,
                                                trans_to_index,
                                                char_to_index,
                                                is_train=True)
        score = model.evaluate(x_test, y_test, verbose=1)
        print("Evaluated on validation data", score)
    else:
        print("Validation disabled.")

    utils.save_acc_loss_results(args, history)
    utils.write_results_file(args, history, train_text, val_text)
Example #13
def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_intent_total = 0
    loss_slot_total = 0
    predict_slot_all = np.array([], dtype=int)
    predict_intent_all = np.array([], dtype=int)
    labels_slot_all = np.array([], dtype=int)
    labels_intent_all = np.array([], dtype=int)
    with torch.no_grad():
        i = 0
        for texts, labels, slot in data_iter:
            # print(i)
            if texts[0].shape[0] == 0 or labels.shape[0] == 0:
                continue
            outputs = model(texts)
            slot_outputs = outputs[0]
            intent_outputs = outputs[1]
            slot = slot.view(-1)
            # loss_intent = F.multi_margin_loss(intent_outputs, labels)
            intent_1 = torch.max(intent_outputs, dim=-1, keepdim=False)[0].cuda()
            loss_intent = F.cross_entropy(intent_1, labels)
            loss_slot = F.cross_entropy(slot_outputs, slot)
            loss_slot_total += loss_slot
            loss_intent_total += loss_intent
            labels = labels.data.cpu().numpy()
            slot = slot.data.cpu().numpy()
            predict_intent = torch.max(intent_1.data, 1)[1].cpu()
            predict_slot = torch.max(slot_outputs.data, 1)[1].cpu()
            labels_intent_all = np.append(labels_intent_all, labels)
            labels_slot_all = np.append(labels_slot_all, slot)
            predict_intent_all = np.append(predict_intent_all, predict_intent)
            predict_slot_all = np.append(predict_slot_all, predict_slot)
            i += 1
    acc_intent = metrics.accuracy_score(labels_intent_all, predict_intent_all)
    new_labels_slot_all = []
    new_predict_slot_all = []
    for a, b in zip(labels_slot_all, predict_slot_all):
        if a == b and a == 72:
            continue
        else:
            new_labels_slot_all.append(a)
            new_predict_slot_all.append(b)
    new_labels_slot_all = np.array(new_labels_slot_all)
    new_predict_slot_all = np.array(new_predict_slot_all)
    acc_slot = metrics.accuracy_score(new_labels_slot_all, new_predict_slot_all)
    if test:
        import os
        from utils import load_vocabulary
        # slot_vocab = load_vocabulary(os.path.join(config.vocab_path, 'test_slot_vocab'))
        # slot_vocab['rev'] = slot_vocab['rev'][0:72]
        intent_vocab = load_vocabulary(os.path.join(config.vocab_path, 'intent_vocab'))
        report_intent = metrics.classification_report(labels_intent_all, predict_intent_all,
                                                      target_names=intent_vocab['rev'], digits=4)
        # report_slot = metrics.classification_report(new_labels_slot_all, new_predict_slot_all,
        #                                             target_names=slot_vocab['rev'], digits=4)
        # print(report_slot)
        confusion_intent = metrics.confusion_matrix(labels_intent_all, predict_intent_all)
        confusion_slot = metrics.confusion_matrix(new_labels_slot_all, new_predict_slot_all)
        return acc_intent, loss_intent_total / len(data_iter), report_intent, confusion_intent, loss_slot_total / len(
            data_iter), acc_slot,  confusion_slot
    return acc_intent, loss_intent_total / len(data_iter), acc_slot, loss_slot_total / len(data_iter)
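
The filtering loop above discards positions where both the gold slot label and the prediction equal id 72, which appears to be this project's padded/'O' slot id. A vectorized sketch of the same step, with the ignored id treated as an assumption:

import numpy as np

def filter_ignored_slot(labels, predictions, ignore_id=72):
    # Sketch: drop positions where both the gold slot label and the
    # prediction equal ignore_id, mirroring the loop above.
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    keep = ~((labels == ignore_id) & (predictions == ignore_id))
    return labels[keep], predictions[keep]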
Example #14
def main():
    parser = argparse.ArgumentParser(description='Attention-based NMT')
    parser.add_argument('SOURCE_VOCAB', help='source vocabulary file')
    parser.add_argument('TARGET_VOCAB', help='target vocabulary file')
    parser.add_argument('model_npz', help='model file')
    parser.add_argument('--validation-source',
                        help='source sentence list for validation')
    parser.add_argument('--validation-target',
                        help='target sentence list for validation')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=128,
                        help='number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='resume the training from snapshot')
    parser.add_argument('--encoder-unit',
                        type=int,
                        default=128,
                        help='number of units')
    parser.add_argument('--encoder-layer',
                        type=int,
                        default=3,
                        help='number of layers')
    parser.add_argument('--encoder-dropout',
                        type=float,
                        default=0.1,
                        help='encoder dropout rate')
    parser.add_argument('--decoder-unit',
                        type=int,
                        default=128,
                        help='number of units')
    parser.add_argument('--attention-unit',
                        type=int,
                        default=128,
                        help='number of units')
    parser.add_argument('--maxout-unit',
                        type=int,
                        default=128,
                        help='number of units')
    parser.add_argument('--min-source-sentence',
                        type=int,
                        default=1,
                        help='minimum length of source sentence')
    parser.add_argument('--max-source-sentence',
                        type=int,
                        default=50,
                        help='maximum length of source sentence')
    parser.add_argument('--log-interval',
                        type=int,
                        default=200,
                        help='number of iteration to show log')
    parser.add_argument('--validation-interval',
                        type=int,
                        default=4000,
                        help='number of iterations to evaluate the model '
                        'with validation dataset')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='directory to output the result')
    parser.add_argument('--debug',
                        action='store_true',
                        help='use a small part of training data')
    args = parser.parse_args()

    source_ids = load_vocabulary(args.SOURCE_VOCAB)
    target_ids = load_vocabulary(args.TARGET_VOCAB)

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    model = Seq2seq(len(source_ids), len(target_ids), args.encoder_layer,
                    args.encoder_unit, args.encoder_dropout, args.decoder_unit,
                    args.attention_unit, args.maxout_unit)
    chainer.serializers.load_npz(args.model_npz, model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    m = MeCab('-Owakati')
    while True:
        line = input('> ')
        words = m.parse(line).split()
        words.append('<EOS>')
        x = np.zeros((1, len(words)), dtype=np.int32)
        for i in range(len(words)):
            x[0, i] = source_ids.get(words[i], UNK)
        result = model.translate(x)
        o_words = []
        for i in range(len(result[0])):
            o_words.append(target_words.get(result[0][i], '<unk>'))
            if o_words[-1] == '<EOS>':
                o_words.pop()
                break
        print(" ".join(o_words))
Example #15
img_embedding_size = 4096

paths = {
    "ckpt": "./ckpt/mae.ckpt.batch2500",
    "test_data": "./data/test",
    "vocab_word": "./data/vocab_word.txt",
    "vocab_attr": "./data/vocab_attr.txt",
    "vocab_value": "./data/vocab_value.txt",
    "image_vector": "./data/image_fc_vectors.npy"
}

use_image = False

print("load data...")

w2i_word, i2w_word = load_vocabulary(paths["vocab_word"])
w2i_attr, i2w_attr = load_vocabulary(paths["vocab_attr"])
w2i_value, i2w_value = load_vocabulary(paths["vocab_value"])

data_processor = DataProcessor(
    paths["test_data"] + "/input.seq",
    paths["test_data"] + "/input.imageindex",
    paths["test_data"] + "/input.attr",
    paths["test_data"] + "/output.value",
    w2i_word,
    w2i_attr, 
    w2i_value, 
    shuffling=False
)

if use_image:
Example #16
    outputs = np.array(outputs, dtype=np.int16).reshape((len(inputs), len(punctuations)))

    f = h5py.File(output_path + '.h5', "w")
    dset = f.create_dataset('inputs', data=inputs, dtype='i8')
    dset = f.create_dataset('outputs',data=outputs, dtype='i8')

    data = {"vocabulary": vocabulary, "punctuations": punctuations, 
           "total_size": len(inputs)}
    
    with open(output_path + '.pkl', 'wb') as output_file:
        cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL)



PHASE1_TRAIN_PATH = "../data/train1"
PHASE1_DEV_PATH = "../data/dev1"
PUNCTUATIONS = {" ": 0, ".PERIOD": 1, ",COMMA": 2}
VOCABULARY_FILE = "../raw_data/vocab"
TRAIN_DATA = "../raw_data/train.txt"
DEV_DATA = "../raw_data/dev.txt"

if not os.path.exists("../data"):
  os.makedirs("../data")

print("Converting data...")

vocabulary = utils.load_vocabulary(VOCABULARY_FILE)

convert_files([TRAIN_DATA], vocabulary, PUNCTUATIONS, PHASE1_TRAIN_PATH)
convert_files([DEV_DATA], vocabulary, PUNCTUATIONS, PHASE1_DEV_PATH)
Example #17
def main(_):
  vocab = load_vocabulary(FLAGS.data_dir)
  data_reader = DataReader(FLAGS.data_dir)

  model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items,
                global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors,
                img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim,
                lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate)

  update_rating, update_review, global_step = train_fn(model)

  saver = tf.compat.v1.train.Saver(max_to_keep=10)

  log_file = open('log.txt', 'w')
  test_step = 0

  config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
  config.gpu_options.allow_growth = True
  with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(1, FLAGS.num_epochs + 1):
      log_info(log_file, "\nEpoch: {}/{}".format(epoch, FLAGS.num_epochs))

      count = 0
      sum_rating_loss = 0
      sum_review_loss = 0

      # Training
      for users, items, ratings in data_reader.read_train_set(FLAGS.batch_size, rating_only=True):
        count += 1

        fd = model.feed_dict(users=users, items=items, ratings=ratings, is_training=True)
        _step, _, _rating_loss = sess.run([global_step, update_rating, model.rating_loss], feed_dict=fd)
        sum_rating_loss += _rating_loss

        review_users, review_items, _, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                            data_reader.train_review)
        img_idx = [data_reader.train_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.train_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images,
                             reviews=reviews, is_training=True)
        _, _review_loss = sess.run([update_review, model.review_loss], feed_dict=fd)
        sum_review_loss += _review_loss

        if _step % FLAGS.display_step == 0:
          data_reader.iter.set_postfix(rating_loss=(sum_rating_loss / count),
                                       review_loss=(sum_review_loss / count))

      # Testing
      review_gen_corpus = defaultdict(list)
      review_ref_corpus = defaultdict(list)

      photo_bleu_scores = defaultdict(list)
      photo_rouge_scores = defaultdict(list)

      review_bleu_scores = defaultdict(list)
      review_rouge_scores = defaultdict(list)

      sess.run(model.init_metrics)
      for users, items, ratings in data_reader.read_test_set(FLAGS.batch_size, rating_only=True):
        test_step += 1

        fd = model.feed_dict(users, items, ratings)
        sess.run(model.update_metrics, feed_dict=fd)

        review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                                         data_reader.test_review)
        img_idx = [data_reader.test_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.test_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images)
        _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

        gen_reviews = decode_reviews(_reviews, vocab)
        ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

        for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
          review_gen_corpus[(user, item)].append(gen)
          review_ref_corpus[(user, item)] += refs

          bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            photo_bleu_scores[order].append(score)

          rouge_scores = rouge([gen], refs)
          for metric, score in rouge_scores.items():
            photo_rouge_scores[metric].append(score)

      _mae, _rmse = sess.run([model.mae, model.rmse])
      log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

      log_info(log_file, '\nReview generation results:')
      log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100,
        np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100))

      for user_item, gen_reviews in review_gen_corpus.items():
        references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

        user_item_bleu_scores = defaultdict(list)
        for gen in gen_reviews:
          bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            user_item_bleu_scores[order].append(score)
        for order, scores in user_item_bleu_scores.items():
          review_bleu_scores[order].append(np.array(scores).mean())

        user_item_rouge_scores = defaultdict(list)
        for gen in gen_reviews:
          rouge_scores = rouge([gen], references)
          for metric, score in rouge_scores.items():
            user_item_rouge_scores[metric].append(score)
        for metric, scores in user_item_rouge_scores.items():
          review_rouge_scores[metric].append(np.array(scores).mean())

      log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100,
        np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100))

      for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
        log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
        log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))

      save_path = saver.save(sess, f"tmp/model{epoch}.ckpt")
      log_info(log_file, '')
Example #18
def main(_):
  vocab = load_vocabulary(FLAGS.data_dir)
  if FLAGS.generating:
    data_reader = DataReader(FLAGS.data_dir, n_reviews=5, generating=True)
  else:
    data_reader = DataReader(FLAGS.data_dir)
  model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items,
                global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors,
                img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim,
                lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate)

  saver = tf.compat.v1.train.Saver(max_to_keep=10)

  log_file = open('log.txt', 'w')
  test_step = 0

  config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
  config.gpu_options.allow_growth = True

  with tf.Session(config=config) as sess:
      saver.restore(sess, FLAGS.ckpt_dir)
      print('Model successfully restored')
      # Testing
      review_gen_corpus = defaultdict(list)
      review_ref_corpus = defaultdict(list)

      photo_bleu_scores = defaultdict(list)
      photo_rouge_scores = defaultdict(list)

      review_bleu_scores = defaultdict(list)
      review_rouge_scores = defaultdict(list)

      sess.run(model.init_metrics)
      for users, items, ratings in data_reader.read_real_test_set(FLAGS.batch_size, rating_only=True):
        test_step += 1

        fd = model.feed_dict(users, items, ratings)
        sess.run(model.update_metrics, feed_dict=fd)

        review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                                         data_reader.real_test_review)
        img_idx = [data_reader.real_test_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.real_test_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images)
        _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

        gen_reviews = decode_reviews(_reviews, vocab)
        ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

        if FLAGS.generating:
          for gen, ref in zip(gen_reviews, ref_reviews):
            gen_str = "GENERATED:\n"+" ".join(gen)
            ref_str = "REFERENCE:\n"+" ".join([" ".join(sentence) for sentence in ref])+"\n"
            log_info(log_file,gen_str)
            log_info(log_file,ref_str)

        for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
          review_gen_corpus[(user, item)].append(gen)
          review_ref_corpus[(user, item)] += refs

          bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            photo_bleu_scores[order].append(score)

          rouge_scores = rouge([gen], refs)
          for metric, score in rouge_scores.items():
            photo_rouge_scores[metric].append(score)

      _mae, _rmse = sess.run([model.mae, model.rmse])
      log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

      log_info(log_file, '\nReview generation results:')
      log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100,
        np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100))

      for user_item, gen_reviews in review_gen_corpus.items():
        references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

        user_item_bleu_scores = defaultdict(list)
        for gen in gen_reviews:
          bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            user_item_bleu_scores[order].append(score)
        for order, scores in user_item_bleu_scores.items():
          review_bleu_scores[order].append(np.array(scores).mean())

        user_item_rouge_scores = defaultdict(list)
        for gen in gen_reviews:
          rouge_scores = rouge([gen], references)
          for metric, score in rouge_scores.items():
            user_item_rouge_scores[metric].append(score)
        for metric, scores in user_item_rouge_scores.items():
          review_rouge_scores[metric].append(np.array(scores).mean())

      log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100,
        np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100))

      for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
        log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
        log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
Example #19
def convert_file(file_path, vocab_file, punct_file, output_path):
    punctuations = {" ":0, ".":1, ",":2}
    punctuations = utils.load_punctuations(punct_file)
    vocabulary = utils.load_vocabulary(vocab_file)
    punctuation = " "
    time_steps = 1 #to be used in future experiments
    
    filename = 'database' # output file name
    f = h5py.File(os.path.join(output_path, filename+'.h5'), "w")
    input_dset = f.create_dataset('inputs', (100, time_steps,len(vocabulary)), dtype='i8', maxshape=(None, time_steps, len(vocabulary)))
    output_dset = f.create_dataset('outputs', (100, len(punctuations)), dtype='i8', maxshape=(None, len(punctuations)))
    data_counter = 0
    with open(file_path, 'r') as corpus:
        for line in corpus:
            array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
            array[0,utils.input_word_index(vocabulary, "<START>")] = 1
            input_dset[data_counter] = array

            array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
            array[0,utils.punctuation_index(punctuations, " ")] = 1
            output_dset[data_counter] = array
            data_counter += 1
            if data_counter == input_dset.shape[0]:
                input_dset.resize(input_dset.shape[0]+1000, axis=0)
                output_dset.resize(output_dset.shape[0]+1000, axis=0)

            for token in line.split():
                if token in punctuations:
                    punctuation = token
                    continue
                else:
                    array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
                    array[0,utils.input_word_index(vocabulary, token)] = 1
                    input_dset[data_counter] = array

                    array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
                    array[0,utils.punctuation_index(punctuations, punctuation)] = 1
                    output_dset[data_counter] = array

                    punctuation = " "
                    data_counter += 1
                    if data_counter == input_dset.shape[0]:
                        input_dset.resize(input_dset.shape[0]+1000, axis=0)
                        output_dset.resize(output_dset.shape[0]+1000, axis=0)

            array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8)
            array[0,utils.input_word_index(vocabulary, "<END>")] = 1
            input_dset[data_counter] = array
         
            array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8)
            array[0,utils.punctuation_index(punctuations, punctuation)] = 1
            output_dset[data_counter] = array

            data_counter += 1
            if data_counter == input_dset.shape[0]:
                input_dset.resize(input_dset.shape[0]+1000, axis=0)
                output_dset.resize(output_dset.shape[0]+1000, axis=0)


    input_dset.resize(data_counter, axis=0)
    output_dset.resize(data_counter, axis=0)

    data = {"vocabulary": vocabulary, "punctuations": punctuations, 
           "total_size": data_counter}
    
    with open(os.path.join(output_path, filename+'.pkl'), 'wb') as output_file:
        cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Done!")
Example #20
def main():
    parser = argparse.ArgumentParser(description='CKBC')
    parser.add_argument('TRAIN', help='training dataset')
    parser.add_argument('CONCEPT_VOCAB', help='concept vocabulary')
    parser.add_argument('RELATION_VOCAB', help='relation vocabulary')
    parser.add_argument('--validation1',
                        help='validation dataset (1)')
    parser.add_argument('--validation2',
                        help='validation dataset (2)')
    parser.add_argument('--test',
                        help='test dataset')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--concept-unit', type=int, default=256,
                        help='number of concept units')
    parser.add_argument('--relation-unit', type=int, default=256,
                        help='number of relation units')
    parser.add_argument('--dropout', type=float, default=0.1,
                        help='dropout rate')
    parser.add_argument('--log-interval', type=int, default=200,
                        help='number of iteration to show log')
    parser.add_argument('--embedding', default='',
                        help='path to pretrained word embedding')
    parser.add_argument('--finetune-embedding', action='store_true',
                        help='finetune pretrained embedding')
    parser.add_argument('--validation-interval', type=int, default=4000,
                        help='number of iterations to evaluate the model '
                        'with validation dataset')
    parser.add_argument('--out', '-o', default='result',
                        help='directory to output the result')
    parser.add_argument('--debug', action='store_true',
                        help='use a small part of training data')
    args = parser.parse_args()

    concept_ids = load_vocabulary(args.CONCEPT_VOCAB)
    relation_ids = load_vocabulary(args.RELATION_VOCAB)
    train_facts = load_data(
        concept_ids,
        relation_ids,
        args.TRAIN,
        debug=args.debug
    )
    train_data = [(h, r, t, y) for h, r, t, y in six.moves.zip(*train_facts)]

    train_head_unk = calculate_unknown_ratio(
        [h for h, _, _, _ in train_data]
    )
    train_relation_unk = calculate_unknown_ratio(
        [r for _, r, _, _ in train_data]
    )
    train_tail_unk = calculate_unknown_ratio(
        [t for _, _, t, _ in train_data]
    )

    embedding = load_embedding(args.embedding, concept_ids) \
        if args.embedding else None
    n_embed = embedding.shape[1] \
        if embedding is not None else args.concept_unit

    print('Concept vocabulary size: %d' % len(concept_ids))
    print('Relation vocabulary size: %d' % len(relation_ids))
    print('Train data size: %d' % len(train_data))
    print('Train head unknown: %.2f' % train_head_unk)
    print('Train relation unknown: %.2f' % train_relation_unk)
    print('Train tail unknown: %.2f' % train_tail_unk)
    if args.embedding:
        print('Pretrained word embedding: %s' % args.embedding)
        print('Fine-tune word embedding: %s' % args.finetune_embedding)

    model = BilinearAVG(
        len(concept_ids),
        len(relation_ids),
        n_embed,
        args.relation_unit,
        args.dropout,
        embedding=embedding
    )
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    if args.embedding != '' and not args.finetune_embedding:
        print('Freezing word embeddings...')
        model.concept_encoder.disable_update()

    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    updater = training.StandardUpdater(
        train_iter,
        optimizer,
        converter=fact_pad_concat_convert,
        device=args.gpu
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'))
    trainer.extend(
        extensions.LogReport(trigger=(args.log_interval, 'iteration'))
    )
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'iteration', 'main/loss', 'validation/main/loss',
             'validation/main/accuracy', 'validation/main/threshold',
             'elapsed_time']
        ),
        trigger=(args.log_interval, 'iteration')
    )

    if args.validation1 and args.validation2:
        test_facts = load_data(
            concept_ids,
            relation_ids,
            args.validation1
        )
        test_data1 = [(h, r, t, y)
                      for h, r, t, y in six.moves.zip(*test_facts)]
        test_head_unk = calculate_unknown_ratio(
            [h for h, _, _, _ in test_data1]
        )
        test_relation_unk = calculate_unknown_ratio(
            [r for _, r, _, _ in test_data1]
        )
        test_tail_unk = calculate_unknown_ratio(
            [t for _, _, t, _ in test_data1]
        )
        print('Validation data: %d' % len(test_data1))
        print('Validation head unknown: %.2f' % test_head_unk)
        print('Validation relation unknown: %.2f' % test_relation_unk)
        print('Validation tail unknown: %.2f' % test_tail_unk)

        test_facts = load_data(
            concept_ids,
            relation_ids,
            args.validation2
        )
        test_data2 = [(h, r, t, y)
                      for h, r, t, y in six.moves.zip(*test_facts)]
        test_head_unk = calculate_unknown_ratio(
            [h for h, _, _, _ in test_data2]
        )
        test_relation_unk = calculate_unknown_ratio(
            [r for _, r, _, _ in test_data2]
        )
        test_tail_unk = calculate_unknown_ratio(
            [t for _, _, t, _ in test_data2]
        )
        print('Validation data: %d' % len(test_data2))
        print('Validation head unknown: %.2f' % test_head_unk)
        print('Validation relation unknown: %.2f' % test_relation_unk)
        print('Validation tail unknown: %.2f' % test_tail_unk)

        trainer.extend(
            CalculateAccuracy(
                model, test_data1, test_data2, device=args.gpu,
                key_accuracy='validation/main/accuracy',
                key_threshold='validation/main/threshold'
            ),
            trigger=(args.validation_interval, 'iteration')
        )

    print('start training')
    trainer.run()
Example #21
# Create date : 2020/12/31 17:07
# IDE         : pycharm
#=====================================
import tensorflow as tf
import os

from model_lstm_crf import MyModel
from utils import DataProcessor_LSTM as DataProcessor
from utils import load_vocabulary
from utils import extract_kvpairs_in_bio
from utils import cal_f1_score

os.environ['CUDA_VISIBLE_DEVICES'] = '3'
lstm_crf_ckpt = "models/"
base_dir = "./data/ner_data"
w2i_char, i2w_char = load_vocabulary(os.path.join(base_dir, "vocab.txt"))
w2i_bio, i2w_bio = load_vocabulary(os.path.join(base_dir, "vocab_bio.txt"))


data_processor = DataProcessor(
    os.path.join(base_dir, 'valid.txt'),
    os.path.join(base_dir, "valid_bio.txt"),
    w2i_char,
    w2i_bio,
    shuffling=True
)

model = MyModel(embedding_dim=300,
                hidden_dim=300,
                vocab_size_char=len(w2i_char),
                vocab_size_bio=len(w2i_bio),
Example #22
elif config.dataset == 'atis':
    print('use atis dataset')

model_name = 'capsule'

from utils import build_dataset, build_iterator, get_time_dif, load_vocabulary, build_vocab

torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True
start_time = time.time()
print('Loading data...')
build_vocab(config.input_file, os.path.join(config.vocab_path, 'in_vocab'))
build_vocab(config.slot_file, os.path.join(config.vocab_path, 'slot_vocab'))
build_vocab(config.intent_file, os.path.join(config.vocab_path, 'intent_vocab'), pad=False, unk=False)
in_vocab = load_vocabulary(os.path.join(config.vocab_path, 'in_vocab'))
slot_vocab = load_vocabulary(os.path.join(config.vocab_path, 'slot_vocab'))
intent_vocab = load_vocabulary(os.path.join(config.vocab_path, 'intent_vocab'))
train_data, dev_data, test_data = build_dataset(in_vocab['vocab'], slot_vocab['vocab'], intent_vocab['vocab'])

train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)
test_iter = build_iterator(test_data)
time_dif = get_time_dif(start_time)
print('time usage:', time_dif)

config.n_vocab = len(in_vocab['vocab'])

x = import_module(model_name)
model = x.Model(config).to(torch.device('cuda'))
init_network(model)
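
In this project (and in Example #13 above) load_vocabulary apparently returns a dict with a 'vocab' token-to-id map and a 'rev' id-to-token list. A minimal sketch consistent with that usage, assuming one token per line:

def load_vocabulary(vocab_path):
    # Sketch only; the real utils implementation may differ.
    rev = []
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            if token:
                rev.append(token)
    return {'vocab': {tok: idx for idx, tok in enumerate(rev)}, 'rev': rev}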
Example #23
    if os.path.exists(log_file_path):
        os.remove(log_file_path)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s | %(message)s",
                                  "%Y-%m-%d %H:%M:%S")
    chlr = logging.StreamHandler()
    chlr.setFormatter(formatter)
    fhlr = logging.FileHandler(log_file_path)
    fhlr.setFormatter(formatter)
    logger.addHandler(chlr)
    logger.addHandler(fhlr)

    logger.info("loading vocab...")
    w2i_char, i2w_char = load_vocabulary("data/vocab_char.txt")
    w2i_bio, i2w_bio = load_vocabulary("data/vocab_bio.txt")
    w2i_attr, i2w_attr = load_vocabulary("data/vocab_attr.txt")

    logger.info("loading data...")
    data_processor_train = DataProcessor("data/train/input.seq.char",
                                         "data/train/output.seq.bio",
                                         "data/train/output.seq.attr",
                                         w2i_char,
                                         w2i_bio,
                                         w2i_attr,
                                         shuffling=True)
    data_processor_valid = DataProcessor("data/test/input.seq.char",
                                         "data/test/output.seq.bio",
                                         "data/test/output.seq.attr",
                                         w2i_char,
Example #24
File: train.py Project: hrlinlp/JAVE-1
    "img_block_num": 49,  # # num of regional image features  (7×7=49)
    "attn_size": 200,  # hidden dim in attention
    "batch_size": 128,  # batch size
    "dropout_prob": 0  # probability of dropout layers
}

paths = {
    "ckpt": "./ckpt/model.ckpt",
    "vocab": "./vocab",
    "embedded": "./data/embedded",
    "train_data": "./data/train",
    "valid_data": "./data/valid",
    "test_data": "./data/test"
}

w2i_word, i2w_word = load_vocabulary(paths["vocab"] + "/vocab.word")
w2i_bio, i2w_bio = load_vocabulary(paths["vocab"] + "/vocab.bio")
w2i_label, i2w_label = load_vocabulary(paths["vocab"] + "/vocab.label")

# embedding_container: restore all vectors encoded by pre-trained bert and resnet
embedding_container = EmbeddingContainer(
    paths["embedded"] +
    "/sids_of_txts",  # indexes to find text encoded vector 
    paths["embedded"] +
    "/txts.embedded.npy",  # text encoded by pre-trained bert, shape=[N, max_len_of_word_seqs, dim_of_bert_output]
    paths["embedded"] +
    "/txts.embeddedG.npy",  # vectors of [CLS] encoded by a pre-trained bert, shape=[N, dim_of_bert_output]
    paths["embedded"] +
    "/cids_of_imgs",  # indexes to find image encoded vector 
    paths["embedded"] +
    "/imgs.embedded.npy",  # image encoded by pre-trained resnet, shape=[N, image_region_num, dim_of_resnet_output]
Example #25
# set logging
log_file_path = "./ckpt/run.log"
if os.path.exists(log_file_path): os.remove(log_file_path)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(message)s", "%Y-%m-%d %H:%M:%S")
chlr = logging.StreamHandler()
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("./data/vocab_char.txt")
w2i_bio, i2w_bio = load_vocabulary("./data/vocab_bioattr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("./data/train/input.seq.char",
                                     "./data/train/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)

data_processor_valid = DataProcessor("./data/test/input.seq.char",
                                     "./data/test/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)
Example #26
def main():
    parser = argparse.ArgumentParser(description='Attention-based NMT')
    parser.add_argument('SOURCE', help='source sentence list')
    parser.add_argument('TARGET', help='target sentence list')
    parser.add_argument('SOURCE_VOCAB', help='source vocabulary file')
    parser.add_argument('TARGET_VOCAB', help='target vocabulary file')
    parser.add_argument('--validation-source',
                        help='source sentence list for validation')
    parser.add_argument('--validation-target',
                        help='target sentence list for validation')
    parser.add_argument('--batchsize', '-b', type=int, default=128,
                        help='number of sentence pairs in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--resume', '-r', default='',
                        help='resume the training from snapshot')
    parser.add_argument('--encoder-unit', type=int, default=128,
                        help='number of encoder units')
    parser.add_argument('--encoder-layer', type=int, default=3,
                        help='number of encoder layers')
    parser.add_argument('--encoder-dropout', type=float, default=0.1,
                        help='dropout rate of the encoder')
    parser.add_argument('--decoder-unit', type=int, default=128,
                        help='number of decoder units')
    parser.add_argument('--attention-unit', type=int, default=128,
                        help='number of attention units')
    parser.add_argument('--maxout-unit', type=int, default=128,
                        help='number of maxout units')
    parser.add_argument('--min-source-sentence', type=int, default=1,
                        help='minimum length of source sentence')
    parser.add_argument('--max-source-sentence', type=int, default=50,
                        help='maximum length of source sentence')
    parser.add_argument('--log-interval', type=int, default=200,
                        help='number of iterations between log outputs')
    parser.add_argument('--validation-interval', type=int, default=4000,
                        help='number of iterations between evaluations of '
                        'the model on the validation dataset')
    parser.add_argument('--out', '-o', default='result',
                        help='directory to output the result')
    parser.add_argument('--debug', action='store_true',
                        help='use a small part of training data')
    args = parser.parse_args()

    source_ids = load_vocabulary(args.SOURCE_VOCAB)
    target_ids = load_vocabulary(args.TARGET_VOCAB)
    train_source = load_data(source_ids, args.SOURCE, debug=args.debug)
    train_target = load_data(target_ids, args.TARGET, debug=args.debug)
    assert len(train_source) == len(train_target)
    train_data = [(s, t)
                  for s, t in six.moves.zip(train_source, train_target)
                  if args.min_source_sentence <= len(s)
                  <= args.max_source_sentence and
                  args.min_source_sentence <= len(t)
                  <= args.max_source_sentence]
    train_source_unk = calculate_unknown_ratio(
        [s for s, _ in train_data]
    )
    train_target_unk = calculate_unknown_ratio(
        [t for _, t in train_data]
    )

    print('Source vocabulary size: {}'.format(len(source_ids)))
    print('Target vocabulary size: {}'.format(len(target_ids)))
    print('Train data size: {}'.format(len(train_data)))
    print('Train source unknown: {0:.2f}'.format(train_source_unk))
    print('Train target unknown: {0:.2f}'.format(train_target_unk))

    target_words = {i: w for w, i in target_ids.items()}
    source_words = {i: w for w, i in source_ids.items()}

    model = Seq2seq(len(source_ids), len(target_ids), args.encoder_layer,
                    args.encoder_unit, args.encoder_dropout,
                    args.decoder_unit, args.attention_unit, args.maxout_unit)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()
        model.to_gpu(args.gpu)

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize)
    updater = training.StandardUpdater(
        train_iter, optimizer, converter=seq2seq_pad_concat_convert,
        device=args.gpu
    )
    trainer = training.Trainer(updater, (args.epoch, 'epoch'))
    trainer.extend(
        extensions.LogReport(trigger=(args.log_interval, 'iteration'))
    )
    trainer.extend(
        extensions.PrintReport(
            ['epoch', 'iteration', 'main/loss', 'validation/main/loss',
             'main/perp', 'validation/main/perp', 'validation/main/bleu',
             'elapsed_time']
        ),
        trigger=(args.log_interval, 'iteration')
    )

    if args.validation_source and args.validation_target:
        test_source = load_data(source_ids, args.validation_source)
        test_target = load_data(target_ids, args.validation_target)
        assert len(test_source) == len(test_target)
        test_data = list(six.moves.zip(test_source, test_target))
        test_data = [(s, t) for s, t in test_data if 0 < len(s) and 0 < len(t)]
        test_source_unk = calculate_unknown_ratio(
            [s for s, _ in test_data]
        )
        test_target_unk = calculate_unknown_ratio(
            [t for _, t in test_data]
        )

        print('Validation data: {}'.format(len(test_data)))
        print('Validation source unknown: {0:.2f}'.format(test_source_unk))
        print('Validation target unknown: {0:.2f}'.format(test_target_unk))

        @chainer.training.make_extension()
        def translate(_):
            source, target = seq2seq_pad_concat_convert(
                [test_data[numpy.random.choice(len(test_data))]],
                args.gpu
            )
            result = model.translate(source)[0].reshape(1, -1)

            source, target, result = source[0], target[0], result[0]

            source_sentence = ' '.join([source_words[int(x)] for x in source])
            target_sentence = ' '.join([target_words[int(y)] for y in target])
            result_sentence = ' '.join([target_words[int(y)] for y in result])
            print('# source : ' + source_sentence)
            print('# result : ' + result_sentence)
            print('# expect : ' + target_sentence)

        trainer.extend(
            translate,
            trigger=(args.validation_interval, 'iteration')
        )

        trainer.extend(
            CalculateBleu(
                model, test_data, device=args.gpu,
                key='validation/main/bleu'
            ),
            trigger=(args.validation_interval, 'iteration')
        )

    print('start training')
    trainer.run()
    chainer.serializers.save_npz('%s/model.npz' % args.out, model)
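# The "unknown" figures printed above come from calculate_unknown_ratio, which
# reports the share of unknown-token ids in a list of encoded sentences. A
# minimal sketch, assuming the unknown token is mapped to id 0 (the actual id
# depends on how load_vocabulary built source_ids/target_ids):
def calculate_unknown_ratio(data, unk_id=0):
    # data: iterable of id sequences (lists or numpy arrays)
    unknown = sum(int(x == unk_id) for sentence in data for x in sentence)
    total = sum(len(sentence) for sentence in data)
    return unknown / float(total) if total else 0.0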
Example #27
0
else:
    print("use own dataset: ", arg.dataset)

full_train_path = os.path.join("./data", arg.dataset, arg.train_data_path)
full_test_path = os.path.join('./data', arg.dataset, arg.test_data_path)
full_valid_path = os.path.join('./data', arg.dataset, arg.valid_data_path)

create_vocabulary(os.path.join(full_train_path, arg.input_file),
                  os.path.join(arg.vocab_path, "in_vocab"))
create_vocabulary(os.path.join(full_train_path, arg.slot_file),
                  os.path.join(arg.vocab_path, "slot_vocab"))
create_vocabulary(os.path.join(full_train_path, arg.intent_file),
                  os.path.join(arg.vocab_path, "intent_vocab"))

# each vocabulary bundles the word-to-id mapping and the word list
in_vocab = load_vocabulary(os.path.join(arg.vocab_path, "in_vocab"))
slot_vocab = load_vocabulary(os.path.join(arg.vocab_path, "slot_vocab"))
intent_vocab = load_vocabulary(os.path.join(arg.vocab_path, "intent_vocab"))
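# Unlike the (w2i, i2w) convention used elsewhere on this page, here
# load_vocabulary returns a single object per vocabulary. A plausible minimal
# variant matching the comment above bundles the word-to-id mapping and the
# ordered word list together; the helper name and exact structure below are
# assumptions, not the original code:
def load_vocabulary_single(path):
    with open(path, "r", encoding="utf-8") as f:
        words = [line.strip() for line in f if line.strip()]
    return {"vocab": {w: i for i, w in enumerate(words)}, "rev": words}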


def create_model(input_data,
                 input_size,
                 sequence_length,
                 slot_size,
                 intent_size,
                 layer_size=128,
                 is_training=True):
    """
    input_data: 输入数据[batch, len]
    input_size: 输入数据中单词的个数
    sequence_length: 数据的长度[batch]
Example #28
0
# set logging
log_file_path = "./ckpt/run.log"
if os.path.exists(log_file_path): os.remove(log_file_path)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(message)s", "%Y-%m-%d %H:%M:%S")
chlr = logging.StreamHandler()
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("./data/vocab_char.txt")
w2i_word, i2w_word = load_vocabulary("./data/vocab_word.txt")
w2i_bio, i2w_bio = load_vocabulary("./data/vocab_bio.txt")
w2i_attr, i2w_attr = load_vocabulary("./data/vocab_attr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("./data/train/input.seq.char",
                                     "./data/train/input.seq.word",
                                     "./data/train/output.seq.bio",
                                     "./data/train/output.seq.attr",
                                     w2i_char,
                                     w2i_word,
                                     w2i_bio,
                                     w2i_attr,
                                     shuffling=True)
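# A minimal sketch of a DataProcessor consistent with the constructor call
# above: it reads the four parallel sequence files, converts tokens to ids with
# the corresponding vocabularies, and optionally shuffles the sample order.
# Padding, batching, and the exact attribute names are assumptions.
import random

class DataProcessor:
    def __init__(self, char_path, word_path, bio_path, attr_path,
                 w2i_char, w2i_word, w2i_bio, w2i_attr, shuffling=False):
        def read_and_index(path, w2i):
            seqs = []
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    tokens = line.strip().split(" ")
                    # unseen tokens fall back to <unk> when present, else id 0
                    seqs.append([w2i.get(t, w2i.get("<unk>", 0)) for t in tokens])
            return seqs

        self.char_seqs = read_and_index(char_path, w2i_char)
        self.word_seqs = read_and_index(word_path, w2i_word)
        self.bio_seqs = read_and_index(bio_path, w2i_bio)
        self.attr_seqs = read_and_index(attr_path, w2i_attr)
        self.order = list(range(len(self.char_seqs)))
        if shuffling:
            random.shuffle(self.order)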
Example #29
0
def main():
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--grad_clip', default=100, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--num_epochs', default=50, type=int)
    parser.add_argument('--seq_len', default=60, type=int)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--model_name_prefix', default='model')
    parser.add_argument('--language', default='hy-AM')
    parser.add_argument('--start_from', default=0, type=float)
    args = parser.parse_args()
   
    print("Loading Files")
    
    (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language = args.language)
    (train_text, val_text, trans) = utils.load_language_data(language = args.language)
    data_size = len(train_text)
    
    print("Building Network ...")
   
    (output_layer, train, cost) = utils.define_model(args.hdim, args.depth, args.lr, args.grad_clip, trans_vocab_size, vocab_size, is_train = True)
    
    if args.model:
        f = np.load('languages/' + args.language + '/models/' + args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)
    
    print("Training ...")
    step_cnt = 0
    date_at_beginning = datetime.now()
    last_time = date_at_beginning
    for epoch in range(args.num_epochs):
        train_text = train_text.split(u'։')
        random.shuffle(train_text)
        train_text = u'։'.join(train_text)
        avg_cost = 0.0
        count = 0
        num_of_samples = 0
        num_of_chars = 0
        for (x, y) in utils.data_generator(train_text, args.seq_len, args.batch_size, trans, trans_to_index, char_to_index, is_train = True):
            sample_cost = train(x, np.reshape(y,(-1,vocab_size)))
            sample_cost = float(sample_cost)
            count += 1
            num_of_samples += x.shape[0]
            num_of_chars += x.shape[0] * x.shape[1]
            
            time_now = datetime.now()
            if (time_now - last_time).total_seconds() > 60 * 1: # 1 minute
                print('Computing validation loss...')
                val_cost = 0.0
                val_count = 0.0
                for ((x_val, y_val, indices, delimiters), non_valids_list) in utils.data_generator(val_text, args.seq_len, args.batch_size, trans, trans_to_index, char_to_index, is_train = False):
                    val_cost += x_val.shape[0] *cost(x_val,np.reshape(y_val,(-1,vocab_size)))
                    val_count += x_val.shape[0]
                print('Validation loss is {}'.format(val_cost/val_count))
                
                file_name = 'languages/{}/models/{}.hdim{}.depth{}.seq_len{}.bs{}.time{:4f}.epoch{}.loss{:.4f}'.format(args.language, args.model_name_prefix, args.hdim, args.depth, args.seq_len, args.batch_size, (time_now - date_at_beginning).total_seconds()/60, epoch, val_cost/val_count)
                print("saving to -> " + file_name)
                np.save(file_name, lasagne.layers.get_all_param_values(output_layer))
                last_time = datetime.now()
            
            print("On step #{} loss is {:.4f}, samples passed {}, chars_passed {}, {:.4f}% of an epoch {} time passed {:4f}"\
                  .format(count, sample_cost, num_of_samples, num_of_chars, 100.0*num_of_chars/len(train_text), epoch, (time_now - date_at_beginning).total_seconds()/60.0))
                  
            avg_cost += sample_cost
Example #30
0
                    default=0,
                    help='1 for test, 0 for training')
parser.add_argument('--seed', type=int, default=1111, help='random seed')
parser.add_argument('--resume',
                    default='insurance/V2/checkpoints/model_best.tar',
                    type=str,
                    metavar='PATH',
                    help='path saved params')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

PAD = '<PAD>'
id_to_word, label_to_ans, label_to_ans_text = load_vocabulary(
    'insuranceQA/V2/vocabulary',
    'insuranceQA/V2/InsuranceQA.label2answer.token.encoded')
w2i = {w: i for i, w in enumerate(id_to_word.values(), 1)}
w2i[PAD] = 0
vocab_size = len(w2i)
print('vocab_size:', vocab_size)

train_data = load_data(
    'insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.train.encoded',
    id_to_word, label_to_ans_text)
test_data = load_data2(
    'insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.test.encoded',
    id_to_word, label_to_ans_text)
print('n_train:', len(train_data))
print('n_test:', len(test_data))
Example #31
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--grad_clip', default=100, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--num_epochs', default=10, type=int)
    parser.add_argument('--seq_len', default=60, type=int)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--model_name_prefix', default='model')
    parser.add_argument('--language', default='hy-AM')
    parser.add_argument('--start_from', default=0, type=float)
    args = parser.parse_args()

    print("Loading Files")

    (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans,
     trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (train_text, val_text,
     trans) = utils.load_language_data(language=args.language)
    data_size = len(train_text)

    print("Building Network ...")

    (output_layer, train, cost) = utils.define_model(args.hdim,
                                                     args.depth,
                                                     args.lr,
                                                     args.grad_clip,
                                                     trans_vocab_size,
                                                     vocab_size,
                                                     is_train=True)

    if args.model:
        f = np.load('languages/' + args.language + '/models/' + args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)

    print("Training ...")
    p = int(len(train_text) * args.start_from) + 1
    step_cnt = 0
    avg_cost = 0
    it = 0
    while it < args.num_epochs:
        avg_cost = 0
        date_at_beginning = datetime.now()
        non_native_skipped = 0
        for _ in range(PRINT_FREQ):
            x, y, p, turned, non_native_sequences = utils.gen_data(
                p, args.seq_len, args.batch_size, train_text, trans,
                trans_to_index, char_to_index)
            if turned:
                it += 1
            avg_cost += train(x, np.reshape(y, (-1, vocab_size)))
            non_native_skipped += non_native_sequences
        date_after = datetime.now()
        print("Epoch {} average loss = {} Time {} sec. Nonnatives skipped {}".
              format(1.0 * it + 1.0 * p / data_size, avg_cost / PRINT_FREQ,
                     (date_after - date_at_beginning).total_seconds(),
                     non_native_skipped))

        step_cnt += 1
        if True:  #step_cnt * args.batch_size > 100000:
            print('computing validation loss...')
            val_turned = False
            val_p = 0
            val_steps = 0.
            val_cost = 0.
            while not val_turned:
                x, y, val_p, val_turned, non_native = utils.gen_data(
                    val_p, args.seq_len, args.batch_size, val_text, trans,
                    trans_to_index, char_to_index)
                val_steps += 1
                val_cost += cost(x, np.reshape(y, (-1, vocab_size)))
            print('validation loss is ' + str(val_cost / val_steps))
            file_name = 'languages/' + args.language + '/models/' + args.model_name_prefix + '.hdim' + str(
                args.hdim) + '.depth' + str(args.depth) + '.seq_len' + str(
                    args.seq_len) + '.bs' + str(
                        args.batch_size) + '.epoch' + str(
                            1.0 * it + 1.0 * p / data_size) + '.loss' + str(
                                avg_cost / PRINT_FREQ) + '.npz'
            print("saving to -> " + file_name)
            np.save(file_name,
                    lasagne.layers.get_all_param_values(output_layer))
            step_cnt = 0
Example #32
0
    np.random.seed(conf.RANDOM_SEED)

    t0 = time()

    ### convert data ###
    if not os.path.exists(
            "/Users/mayili/Documents/intern/NLP/punctuation/punctuator-master/data"
    ):

        print("Converting data...\n")

        os.makedirs(
            "/Users/mayili/Documents/intern/NLP/punctuation/punctuator-master/data"
        )

        vocabulary = utils.load_vocabulary(conf.VOCABULARY_FILE)

        converter.convert_files(conf.PHASE1["TRAIN_DATA"], vocabulary,
                                conf.PUNCTUATIONS, conf.BATCH_SIZE, False,
                                PHASE1_TRAIN_PATH)
        converter.convert_files(conf.PHASE1["DEV_DATA"], vocabulary,
                                conf.PUNCTUATIONS, conf.BATCH_SIZE, False,
                                PHASE1_DEV_PATH)
# =============================================================================
#         if conf.PHASE2["TRAIN_DATA"] and conf.PHASE2["DEV_DATA"]:
#             converter.convert_files(conf.PHASE2["TRAIN_DATA"], vocabulary, conf.PUNCTUATIONS, conf.BATCH_SIZE, conf.PHASE2["USE_PAUSES"], PHASE2_TRAIN_PATH)
#             converter.convert_files(conf.PHASE2["DEV_DATA"], vocabulary, conf.PUNCTUATIONS, conf.BATCH_SIZE, conf.PHASE2["USE_PAUSES"], PHASE2_DEV_PATH)
# =============================================================================

### train model ###
    print("Training model...\n")