Example #1
    def __init__(self,
                 vocab_size,
                 embed_size,
                 hidden_size,
                 output_size,
                 pre_word_embed=None,
                 dropout=0.5,
                 use_gpu=False):
        super(Intent_Model, self).__init__()
        self.use_gpu = use_gpu
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.dropout = nn.Dropout(dropout)
        self.word_embed = nn.Embedding(vocab_size, embed_size)

        if pre_word_embed is not None:
            self.word_embed.weight = nn.Parameter(
                torch.FloatTensor(pre_word_embed))
            self.pre_word_embed = True
        else:
            self.pre_word_embed = False
            init_embedding(self.word_embed.weight)

        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            bidirectional=True,
                            batch_first=True)
        init_lstm(self.lstm)
        self.output_layer = nn.Linear(hidden_size * 2, output_size)
        init_linear(self.output_layer)
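The PyTorch examples call init_embedding, init_lstm and init_linear without ever showing them. A minimal sketch of what these helpers commonly look like in BiLSTM/NER repositories of this kind (an assumption, not code taken from any of the projects above; the TensorFlow examples further down use different, framework-specific init_embedding variants):

import torch.nn as nn

def init_embedding(weight):
    # Uniform init in [-b, b] with b = sqrt(3 / dim), so each embedding
    # row starts with roughly unit variance.
    bound = (3.0 / weight.size(1)) ** 0.5
    nn.init.uniform_(weight, -bound, bound)

def init_linear(layer):
    # Glorot-style uniform init for the weight matrix; bias starts at zero.
    bound = (6.0 / (layer.weight.size(0) + layer.weight.size(1))) ** 0.5
    nn.init.uniform_(layer.weight, -bound, bound)
    if layer.bias is not None:
        layer.bias.data.zero_()

def init_lstm(lstm):
    # Uniform init for every weight matrix; biases start at zero with the
    # forget-gate slice set to 1.
    for name, param in lstm.named_parameters():
        if 'weight' in name:
            bound = (6.0 / (param.size(0) // 4 + param.size(1))) ** 0.5
            nn.init.uniform_(param, -bound, bound)
        else:
            param.data.zero_()
            n = param.size(0)
            param.data[n // 4:n // 2] = 1.0  # forget-gate bias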
Example #2
    def rand_init(self, init_char_embedding=True, init_word_embedding=False):
        """
        random initialization

        args:
            init_char_embedding: random initialize char embedding or not
            init_word_embedding: random initialize word embedding or not
        """

        if init_char_embedding:
            utils.init_embedding(self.char_embeds.weight)
        if init_word_embedding:
            utils.init_embedding(self.word_embeds.weight)
        if self.if_highway:
            self.forw2char.rand_init()
            self.back2char.rand_init()
            self.forw2word.rand_init()
            self.back2word.rand_init()
            self.fb2char.rand_init()
        utils.init_lstm(self.forw_char_lstm)
        utils.init_lstm(self.back_char_lstm)
        utils.init_lstm(self.word_lstm)
        utils.init_linear(self.char_pre_train_out)
        utils.init_linear(self.word_pre_train_out)
        self.crf.rand_init()
Example #3
    def rand_init(self, init_embedding=False):
        """
        random initialization

        args:
            init_embedding: random initialize word embedding or not
        """
        if init_embedding:
            utils.init_embedding(self.word_embeds.weight)
        if self.position:
            utils.init_embedding(self.position_embeds.weight)
        utils.init_lstm(self.lstm)
        utils.init_linear(self.att2out)
Example #4
    def get_embed(self, fembed):
        with open(fembed, 'r') as f:
            lines = [line for line in f]
        splits = [line.split() for line in lines]
        # Get the vocabulary and embedding matrix from the pretrained file
        words, embed = zip(*[(split[0], list(map(float, split[1:])))
                             for split in splits])
        # Extend the vocabulary
        self.extend(words)
        # Initialize the word embeddings
        embed = torch.tensor(embed, dtype=torch.float)
        embed_indices = [self.wdict[w] for w in words]
        extended_embed = torch.Tensor(self.n_words, embed.size(1))
        init_embedding(extended_embed)
        extended_embed[embed_indices] = embed

        return extended_embed
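A hypothetical usage of get_embed, assuming vocab is the object that defines the method above and embeddings.txt is a whitespace-separated file with lines of the form "word v1 v2 ... vd" (both names are placeholders):

import torch.nn as nn

embed_matrix = vocab.get_embed('embeddings.txt')  # (n_words, d) float tensor
word_embed = nn.Embedding.from_pretrained(embed_matrix, freeze=False)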
Example #5
    def get_embed(self, fembed):
        with open(fembed, 'r') as f:
            lines = [line for line in f]
        splits = [line.split() for line in lines]
        # Get the vocabulary and embedding matrix from the pretrained file
        words, embed = zip(*[
            (split[0], list(map(float, split[1:]))) for split in splits
        ])
        # Extend the vocabulary
        self.extend(words)
        # Initialize the word embeddings
        embed = torch.tensor(embed, dtype=torch.float)
        embed_indices = [self.wdict[w] for w in words]
        extended_embed = torch.Tensor(self.n_words, embed.size(1))
        init_embedding(extended_embed)
        extended_embed[embed_indices] = embed

        return extended_embed
Example #6
    def rand_init(self, init_embedding=False):
        """
        random initialization

        args:
            init_embedding: random initialize word embedding or not
        """
        if init_embedding:
            utils.init_embedding(self.word_embeds.weight)

        if self.position:
            utils.init_embedding(self.position_embeds.weight)

        if self.enable_att:
            self.attention.rand_init()

        # initialize tree
        self.treernn.rand_init()

        # initialize linear layer
        utils.init_linear(self.linear)
Example #7
def _test(train_args, pretrain_args, args):
    """Test saved model on specified speakers."""
    print('Testing', ', '.join(args.speakers), '...')

    # update args with new test args
    test_args = utils.set_new_args(train_args, args)
    # get test data and id_to_word lookup
    _, _, test_data, id_to_word = data_reader.get_data(test_args)
    # set configurations/hyperparameters for model
    _, test_config = utils.set_config(test_args, id_to_word)

    # model requires init embed but this will be overridden by restored model
    init_embed = utils.init_embedding(id_to_word,
                                      dim=test_args.embed_size,
                                      init_scale=test_args.init_scale,
                                      embed_path=test_args.embed_path)

    with tf.Graph().as_default():
        with tf.name_scope('Test'):
            with tf.variable_scope('Model', reuse=None):
                m_test = model.Model(test_args,
                                     is_training=False,
                                     config=test_config,
                                     init_embed=init_embed,
                                     name='Test')
                m_test.build_graph()

        init = tf.global_variables_initializer()

        # if pretrained, must create dict to initialize TF Saver
        if bool(pretrain_args):
            # get trainable variables and convert to dict for Saver
            reuse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
            # create saver for TF session (see function for addl details)
            saver = utils.create_tf_saver(args, pretrain_args, reuse_vars_dict)
        else:
            saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            print('Restoring model...')
            saver.restore(sess, test_args.load_path)

            # test model on specified speakers
            for test_ind, test_speaker in enumerate(test_args.speakers):
                for train_ind, train_speaker in enumerate(train_args.speakers):
                    print('Testing {0} with {1} model'.format(
                        test_speaker, train_speaker))
                    test_perplexity = _run_epoch(sess, m_test, test_args,
                                                 test_data, train_ind,
                                                 test_ind)
                    print('Test Perplexity: {0:.3f}'.format(test_perplexity))
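This and the other TensorFlow examples below use the TensorFlow 1.x graph/session API (tf.placeholder, tf.Session, tf.get_collection). If they are run under TensorFlow 2, a minimal compatibility shim along these lines would be needed:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()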
Example #8
    def __init__(self, vocab_size, embed_size, hidden_size, tag2id,
                 pre_word_embed=None, dropout=0.5, use_gpu=False):
        super(BiLSTM_CRF, self).__init__()
        self.use_gpu = use_gpu
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.tag2id = tag2id
        self.tag_size = len(tag2id)

        # cnn after
        # self.cnn = CNN_Encoder(hidden_size, hidden_size)
        # self.bridge = nn.Linear(hidden_size*2, self.hidden_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)

        # cnn before
        # self.cnn = CNN_Encoder(embed_size, hidden_size)
        # self.bridge = nn.Linear(hidden_size * 2, self.hidden_size)
        # self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)

        self.dropout = nn.Dropout(dropout)

        self.word_embed = nn.Embedding(vocab_size, embed_size)
        if pre_word_embed is not None:
            self.word_embed.weight = nn.Parameter(torch.FloatTensor(pre_word_embed))
            self.pre_word_embed = True
        else:
            self.pre_word_embed = False
            init_embedding(self.word_embed.weight)


        init_lstm(self.lstm)
        self.hidden2tag = nn.Linear(hidden_size*2, self.tag_size)
        init_linear(self.hidden2tag)
        self.tanh = nn.Tanh()

        # crf layer
        self.transitions = nn.Parameter(torch.zeros(self.tag_size, self.tag_size))
        self.transitions.data[tag2id['START'], :] = -10000
        self.transitions.data[:, tag2id['STOP']] = -10000
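Here transitions[i, j] is the score of moving to tag i from tag j, and the two assignments forbid transitions into START and out of STOP. A minimal sketch of how such a matrix scores a single tag path (an assumption about the CRF code used downstream, which is not shown here):

def path_score(feats, tags, transitions, tag2id):
    # feats: (seq_len, tag_size) emission scores; tags: list of gold tag ids
    score = transitions[tags[0], tag2id['START']] + feats[0][tags[0]]
    for t in range(1, len(tags)):
        score = score + transitions[tags[t], tags[t - 1]] + feats[t][tags[t]]
    return score + transitions[tag2id['STOP'], tags[-1]]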
Example #9
def _generate(train_args, pretrain_args, args):
    """Restore trained model and use to generate sample text."""
    # update args with new generate args
    gen_args = utils.set_new_args(train_args, args)
    # get id_to_word lookup
    _, _, _, id_to_word = data_reader.get_data(gen_args)
    # get hyperparameters corresponding to text generation
    gen_config, _ = utils.set_config(gen_args, id_to_word)

    # model requires init embed but this will be overridden by restored model
    init_embed = utils.init_embedding(id_to_word,
                                      dim=gen_args.embed_size,
                                      init_scale=gen_args.init_scale,
                                      embed_path=gen_args.embed_path)

    with tf.Graph().as_default():
        # use Train name scope as this contains trained model parameters
        with tf.name_scope('Train'):
            with tf.variable_scope('Model', reuse=None):
                m_gen = model.Model(gen_args,
                                    is_training=False,
                                    config=gen_config,
                                    init_embed=init_embed,
                                    name='Generate')
                m_gen.build_graph()

        init = tf.global_variables_initializer()

        # if pretrained, must create dict to initialize TF Saver
        if bool(pretrain_args):
            # get trainable variables and convert to dict for Saver
            reuse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
            # create saver for TF session (see function for addl details)
            saver = utils.create_tf_saver(args, pretrain_args, reuse_vars_dict)
        else:
            saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            print('Restoring model...')
            saver.restore(sess, gen_args.load_path)

            # generate text for all specified speakers
            for gen_ind, gen_speaker in enumerate(gen_args.speakers):
                print('Generating text for {0}'.format(gen_speaker))
                for train_ind, train_speaker in enumerate(train_args.speakers):
                    if gen_speaker == train_speaker:
                        generate_text(sess, m_gen, id_to_word, train_ind,
                                      args.temp)
Example #10
    def __init__(self, vocab_size, embed_size, hidden_size, tag2id,
                 pre_word_embed=None, dropout=0.5, use_gpu=False):
        super(BiLSTM, self).__init__()
        self.use_gpu = use_gpu
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.tag2id = tag2id
        self.tag_size = len(tag2id)

        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

        self.word_embed = nn.Embedding(vocab_size, embed_size)
        if pre_word_embed is not None:
            self.word_embed.weight = nn.Parameter(torch.FloatTensor(pre_word_embed))
            self.pre_word_embed = True
        else:
            self.pre_word_embed = False
            init_embedding(self.word_embed.weight)

        init_lstm(self.lstm)
        self.hidden2tag = nn.Linear(hidden_size * 2, self.tag_size)
        init_linear(self.hidden2tag)
Example #11
    def rand_init(self):
        """
        random initialization

        args:
            init_char_embedding: random initialize char embedding or not

        """

        utils.init_embedding(self.char_embeds.weight)
        if self.char_lstm:
            utils.init_lstm(self.forw_char_lstm)
            utils.init_lstm(self.back_char_lstm)
            utils.init_lstm(self.word_lstm_lm)
            utils.init_linear(self.char_pre_train_out)
            utils.init_linear(self.word_pre_train_out)
            if self.if_highway:
                self.forw2char.rand_init()
                self.back2char.rand_init()
                self.forw2word.rand_init()
                self.back2word.rand_init()
                self.fb2char.rand_init()
        else:
            utils.init_lstm(self.word_lstm_cnn)
Example #12
def run_train():
    data = {
        'trainAnswers': [],
        'devAnswers': [],
        'trainQuestions': [],
        'devQuestions': []
    }
    data['trainAnswers'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/train.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['trainQuestions'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/train.txt.src',
                 encoding="utf-8").read().strip().split('\n')))
    data['devAnswers'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/val.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['devQuestions'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/val.txt.src',
                 encoding="utf-8").read().strip().split('\n')))

    # for debug
    '''
    data['trainAnswers'] = data['trainAnswers'][:5]
    data['trainQuestions'] = data['trainQuestions'][:5]
    data['devAnswers'] = [list(x) for x in data['trainAnswers']]
    data['devQuestions'] = [list(x) for x in data['trainQuestions']]
    '''

    # Build the word-level vocabulary
    if os.path.exists(model_dir + '/vocab_word'):
        t = open(model_dir + '/vocab_word', 'rb')
        vocab_word = pickle.load(t)
        t.close()
    else:
        vocab_word = prepare_vocabulary(data, cut=cut)
        t = open(model_dir + '/vocab_word', 'wb')
        pickle.dump(vocab_word, t)
        t.close()
    print("========================word===========================")
    print('dec_vocab_size: ', vocab_word.n_words_for_decoder)
    print('vocab_size: ', vocab_word.n_words)
    print('max_word_length: ', max(map(lambda x: len(x),
                                       vocab_word.word2index)))

    # Generate data (truncate sequences and create negative examples)
    if os.path.exists(model_dir + '/data'):
        t = open(model_dir + '/data', 'rb')
        train_examples, dev_examples = pickle.load(t)
        t.close()
    else:
        train_examples = gen_data(vocab_word, data['trainQuestions'],
                                  data['trainAnswers'], 1, max_length,
                                  max_num_utterance)
        dev_examples = gen_data(vocab_word, data['devQuestions'],
                                data['devAnswers'], 10, max_length,
                                max_num_utterance)
        t = open(model_dir + '/data', 'wb')
        pickle.dump((train_examples, dev_examples), t)
        t.close()
    print(train_examples[0][1])
    print(train_examples[0][2])
    print(dev_examples[0][1])
    print(dev_examples[0][2])
    print("========================dataset===========================")
    print('train: ', len(train_examples[0]), len(train_examples[1]),
          len(train_examples[2]))
    print('dev: ', len(dev_examples[0]), len(dev_examples[1]),
          len(dev_examples[2]))

    embed = init_embedding(embed_size, vocab_word.n_words,
                           vocab_word.word2index, True)
    embedding = nn.Embedding(vocab_word.n_words, embed_size,
                             padding_idx=0).from_pretrained(embed,
                                                            freeze=False)
    dam = DAM(embed_size, max_num_utterance, max_length, max_stacks)

    embedding = torch.nn.DataParallel(embedding).to(device_cuda)
    dam = torch.nn.DataParallel(dam).to(device_cuda)

    if os.path.isfile(model_dir + '/embedding'):
        embedding.load_state_dict(torch.load(model_dir + '/embedding'))

    if os.path.isfile(model_dir + '/dam'):
        dam.load_state_dict(torch.load(model_dir + '/dam'))

    optimizer = optim.Adam([{
        "params": embedding.parameters()
    }, {
        "params": dam.parameters()
    }],
                           lr=lr,
                           amsgrad=True)

    if os.path.isfile(model_dir + '/optimizer'):
        optimizer.load_state_dict(torch.load(model_dir + '/optimizer'))

    trainIters(vocab_word,
               embedding,
               dam,
               optimizer,
               train_examples,
               dev_examples,
               n_epochs,
               lr,
               batch_size,
               infer_batch_size,
               print_every=1)
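One caveat in the snippet above: nn.Embedding.from_pretrained is a classmethod, so the nn.Embedding(...) instance constructed in front of it, including its padding_idx=0, is discarded. On recent PyTorch versions the same intent can be written more directly (an assumption about what was meant):

embedding = nn.Embedding.from_pretrained(embed, freeze=False, padding_idx=0)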
Example #13
    args['epoch'] = 10  # number of training epochs
    args['hidden_dim'] = 100  # dimension of the LSTM cell output
    args['optimizer'] = 'Adam'  # optimizer used to minimize the loss
    args['lr'] = 0.001  # learning rate
    args['clip'] = 5.0  # gradient clipping threshold
    args['dropout'] = 0.5  # keep probability
    args['update_embedding'] = True  # whether to fine-tune the embedding after initialization
    args['embedding_dim'] = 100  # embedding dimension
    args['shuffle'] = True  # whether to shuffle the data before each training pass

    # Load the dictionary mapping each character to an id; it is built from the training data
    word2id = read_dict(os.path.join('.', args['train_data'], 'word2id.pkl'))

    # Randomly initialize the embedding
    embeddings = init_embedding(word2id, args['embedding_dim'])

    # Set up the model output paths
    model_path = 'BLCM3'
    output_path = os.path.join('.', model_path)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    summary_path = os.path.join(output_path, "summaries")
    if not os.path.exists(summary_path):
        os.makedirs(summary_path)
    model_path = os.path.join(output_path, "checkpoints/")
    #if not os.path.exists(model_path):
    #os.makedirs(model_path)
    ckpt_prefix = os.path.join(model_path, "model")
    result_path = os.path.join(output_path, "results")
    if not os.path.exists(result_path):
        os.makedirs(result_path)
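In this example init_embedding takes a word-to-id dictionary and an embedding dimension, so it presumably returns a freshly built random matrix rather than initializing an existing tensor. A plausible implementation under that assumption (names and bounds are guesses):

import numpy as np

def init_embedding(word2id, embedding_dim):
    # One random row per vocabulary entry, small uniform values.
    embedding_mat = np.random.uniform(-0.25, 0.25, (len(word2id), embedding_dim))
    return np.float32(embedding_mat)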
Example #14
    def __init__(self,
                 vocab_size,
                 tag_to_ix,
                 embedding_dim,
                 hidden_dim,
                 char_to_ix=None,
                 pre_word_embeds=None,
                 char_out_dimension=25,
                 char_embedding_dim=25,
                 use_gpu=True,
                 use_crf=True,
                 char_mode='CNN',
                 encoder_mode='LSTM',
                 dropout=0.5):
        '''
        Input parameters:
                        vocab_size = Size of the vocabulary (int)
                        tag_to_ix = Dictionary that maps NER tags to indices
                        embedding_dim = Dimension of the word embeddings (int)
                        hidden_dim = Hidden dimension of the LSTM layer (int)
                        char_to_ix = Dictionary that maps characters to indices
                        pre_word_embeds = Numpy array of pretrained word embeddings, indexed by word id
                        char_out_dimension = Output dimension of the character CNN encoder
                        char_embedding_dim = Dimension of the character embeddings
                        use_gpu = whether a GPU is available;
                                when True: CUDA function calls are made
                                else: normal CPU function calls are made
                        use_crf = whether to use the CRF layer for output decoding
        '''

        super(BiLSTM_CRF, self).__init__()

        # parameter initialization for the model
        self.use_gpu = use_gpu
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.use_crf = use_crf
        self.tagset_size = len(tag_to_ix)
        self.out_channels = char_out_dimension
        self.char_mode = char_mode
        self.encoder_mode = encoder_mode
        self.char_lstm_dim = char_out_dimension

        if char_embedding_dim is not None:
            self.char_embedding_dim = char_embedding_dim

            # Initializing the character embedding layer
            self.char_embeds = nn.Embedding(len(char_to_ix),
                                            char_embedding_dim)
            init_embedding(self.char_embeds.weight)

            # Performing LSTM encoding on the character embeddings
            if self.char_mode == 'LSTM':
                self.char_lstm = nn.LSTM(char_embedding_dim,
                                         self.char_lstm_dim,
                                         num_layers=1,
                                         bidirectional=True)
                init_lstm(self.char_lstm)

            # Performing CNN encoding on the character embeddings
            if self.char_mode == 'CNN':
                self.char_cnn3 = nn.Conv2d(in_channels=1,
                                           out_channels=self.out_channels,
                                           kernel_size=(3, char_embedding_dim),
                                           padding=(2, 0))

        # Creating the word embedding layer of shape (number of words, embedding dimension)
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        if pre_word_embeds is not None:
            # Initializes the word embeddings with pretrained word embeddings
            self.pre_word_embeds = True
            self.word_embeds.weight = nn.Parameter(
                torch.FloatTensor(pre_word_embeds))
        else:
            self.pre_word_embeds = False

        # Initializing the dropout layer with the dropout rate specified in the parameters
        self.dropout = nn.Dropout(dropout)

        # LSTM layer:
        if self.encoder_mode == 'LSTM':
            if self.char_mode == 'LSTM':
                self.lstm = nn.LSTM(embedding_dim + self.char_lstm_dim * 2,
                                    hidden_dim,
                                    bidirectional=True)
            if self.char_mode == 'CNN':
                self.lstm = nn.LSTM(embedding_dim + self.out_channels,
                                    hidden_dim,
                                    bidirectional=True)

            # Initializing the lstm layer using predefined function for initialization
            init_lstm(self.lstm)

            # Linear layer which maps the output of the bidirectional LSTM into tag space.
            self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        # CNN (One-layer):
        if self.encoder_mode == 'CNN':
            # Conv layer
            self.conv1 = nn.Conv2d(in_channels=1,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 1),
                                   padding=(0, 0))

            # Initializing the conv layer
            nn.init.xavier_uniform_(self.conv1.weight)

            if self.char_mode == 'LSTM':
                print(
                    f'embedding_dim={embedding_dim}, char_lstm_dim={self.char_lstm_dim*2}, in={embedding_dim+self.char_lstm_dim*2}'
                )
                self.maxpool1 = nn.MaxPool2d(
                    (1, embedding_dim + self.char_lstm_dim * 2))
            if self.char_mode == 'CNN':
                print(
                    f'embedding_dim={embedding_dim}, self.out_channels={self.out_channels}, in={embedding_dim+self.out_channels}'
                )
                self.maxpool1 = nn.MaxPool2d(
                    (1, embedding_dim + self.out_channels))

        # CNN (Two-layer):
        if self.encoder_mode == 'CNN2':
            # Conv layer
            self.conv1 = nn.Conv2d(in_channels=1,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 1),
                                   padding=(0, 0))
            self.conv2 = nn.Conv2d(in_channels=hidden_dim * 2,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 1),
                                   padding=(0, 0))

            # Initializing the conv layer
            nn.init.xavier_uniform_(self.conv1.weight)
            nn.init.xavier_uniform_(self.conv2.weight)

            self.maxpool1 = nn.MaxPool2d((1, 2))
            self.maxpool2 = nn.MaxPool2d(
                (1, (embedding_dim + self.out_channels) // 2))

        # CNN (Three-layer):
        if self.encoder_mode == 'CNN3':
            # Conv layer
            self.conv1 = nn.Conv2d(in_channels=1,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 1),
                                   padding=(0, 0))
            self.conv2 = nn.Conv2d(in_channels=hidden_dim * 2,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 1),
                                   padding=(0, 0))
            self.conv3 = nn.Conv2d(in_channels=hidden_dim * 2,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 1),
                                   padding=(0, 0))

            # Initializing the conv layer
            nn.init.xavier_uniform_(self.conv1.weight)
            nn.init.xavier_uniform_(self.conv2.weight)
            nn.init.xavier_uniform_(self.conv3.weight)

            self.maxpool1 = nn.MaxPool2d((1, 2))
            self.maxpool2 = nn.MaxPool2d((1, 2))
            self.maxpool3 = nn.MaxPool2d(
                (1, (embedding_dim + self.out_channels) // 4))

        # CNN (Dilated Three-layer):
        if self.encoder_mode == 'CNN_DILATED':
            # Conv layer
            self.conv1 = nn.Conv2d(in_channels=1,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 2),
                                   padding=(0, 0),
                                   dilation=1)
            self.conv2 = nn.Conv2d(in_channels=hidden_dim * 2,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 2),
                                   padding=(0, 0),
                                   dilation=2)
            self.conv3 = nn.Conv2d(in_channels=hidden_dim * 2,
                                   out_channels=hidden_dim * 2,
                                   kernel_size=(1, 2),
                                   padding=(0, 0),
                                   dilation=3)

            # Initializing the conv layer
            nn.init.xavier_uniform_(self.conv1.weight)
            nn.init.xavier_uniform_(self.conv2.weight)
            nn.init.xavier_uniform_(self.conv3.weight)

            self.maxpool1 = nn.MaxPool2d((1, 2))
            self.maxpool2 = nn.MaxPool2d((1, 2))
            self.maxpool3 = nn.MaxPool2d((1, 27))

        # Linear layer which maps the output of the bidirectional LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

        # Initializing the linear layer using predefined function for initialization
        init_linear(self.hidden2tag)

        if self.use_crf:
            # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
            # Matrix has a dimension of (total number of tags * total number of tags)
            self.transitions = nn.Parameter(
                torch.zeros(self.tagset_size, self.tagset_size))

            # These two statements enforce the constraint that we never transfer
            # to the start tag and we never transfer from the stop tag
            self.transitions.data[tag_to_ix[START_TAG], :] = -10000
            self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
Example #15
def train_with_earlystop(corpus,
                         device,
                         n_hidden=128,
                         n_emb=128,
                         batch_size=32,
                         use_hie=False,
                         nn_type='gru',
                         pooling_type='attention',
                         w2v_fn=None,
                         save_fn=None,
                         disp_proc=True):
    '''
    Input:
        use_hie: whether to use the hierarchical structure
        nn_type: gru, lstm, conv
        pooling_type: mean, max, attention
        w2v_fn: path to a pre-trained word2vec model, or None to skip
    '''
    print('%d training samples' % corpus.current_split_sizes[0])
    print('%d validation samples' % corpus.current_split_sizes[1])

    #    rng = np.random.RandomState(1224)
    #    th_rng = RandomStreams(1224)

    if save_fn is None:
        save_fn = 'model-res/%s-%s-%s.ckpt' % (nn_type, pooling_type, use_hie)

    # Load Word2Vec
    if w2v_fn is None:
        pre_embedding = None
    else:
        print('Loading word2vec model...')
        if w2v_fn == 'tencent':
            vectors = np.load(
                r'G:\word2vec\Tencent-AI-Lab\tencent-ailab-vecs-128.npy')
            word_sr = pd.read_hdf(
                r'G:\word2vec\Tencent-AI-Lab\tencent-ailab-voc.h5', 'voc')
            gensim_w2v = (vectors, word_sr)
        else:
            gensim_w2v = Word2Vec.load(w2v_fn)
        pre_embedding = init_embedding(gensim_w2v, corpus.current_dic)

    classifier = construct_classifier(corpus.current_dic.size,
                                      n_emb,
                                      n_hidden,
                                      corpus.n_target,
                                      pre_embedding=pre_embedding,
                                      use_hie=use_hie,
                                      nn_type=nn_type,
                                      pooling_type=pooling_type)
    classifier.to(device)

    # Loss and Optimizer
    loss_func = nn.NLLLoss()
    adadelta_optimizer = optim.Adadelta(classifier.parameters(),
                                        lr=1.0,
                                        rho=0.9,
                                        weight_decay=1e-8)
    sgd_optimizer = optim.SGD(classifier.parameters(),
                              lr=0.001,
                              momentum=0.9,
                              weight_decay=1e-8)

    # First step: optimize using Adadelta
    disp_freq = 20 if disp_proc is True else None
    train_corpus(classifier,
                 corpus,
                 loss_func,
                 adadelta_optimizer,
                 save_fn,
                 disp_freq=disp_freq,
                 batch_size=batch_size)

    # Retrieve the state optimized by Adadelta
    classifier.load_state_dict(torch.load(save_fn))
    # Second step: optimize using SGD
    train_corpus(classifier,
                 corpus,
                 loss_func,
                 sgd_optimizer,
                 save_fn,
                 disp_freq=disp_freq,
                 batch_size=batch_size)
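A hypothetical call, assuming corpus and the word2vec model path have been prepared elsewhere (both are placeholders):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_with_earlystop(corpus, device,
                     nn_type='gru', pooling_type='attention',
                     w2v_fn='w2v/word2vec.model',
                     save_fn='model-res/gru-attention-False.ckpt')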
Example #16
def run_train():
    data = {
        'trainAnswers': [],
        'devAnswers': [],
        'trainQuestions': [],
        'devQuestions': []
    }
    data['trainAnswers'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/train.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['trainQuestions'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/train.txt.src',
                 encoding="utf-8").read().strip().split('\n')))
    data['devAnswers'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/val.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['devQuestions'].extend(
        map(
            lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/val.txt.src',
                 encoding="utf-8").read().strip().split('\n')))

    # for debug
    '''
    data['trainAnswers'] = data['trainAnswers'][:2]
    data['trainQuestions'] = data['trainQuestions'][:2]
    data['devAnswers'] = [list(x) for x in data['trainAnswers']]
    data['devQuestions'] = [list(x) for x in data['trainQuestions']]
    '''

    # Build the word-level vocabulary
    if os.path.exists(model_dir + '/vocab_word'):
        t = open(model_dir + '/vocab_word', 'rb')
        vocab_word = pickle.load(t)
        t.close()
    else:
        vocab_word = prepare_vocabulary(data, cut=cut)
        t = open(model_dir + '/vocab_word', 'wb')
        pickle.dump(vocab_word, t)
        t.close()
    print("========================word===========================")
    print('dec_vocab_size: ', vocab_word.n_words_for_decoder)
    print('vocab_size: ', vocab_word.n_words)
    print('max_word_length: ', max(map(lambda x: len(x),
                                       vocab_word.word2index)))

    # Generate data (truncate sequences)
    if os.path.exists(model_dir + '/data'):
        t = open(model_dir + '/data', 'rb')
        train_pairs, dev_pairs = pickle.load(t)
        t.close()
    else:
        train_pairs = []
        dev_pairs = []
        for i in range(len(data['trainQuestions'])):
            data['trainQuestions'][i] = cut_utterances(
                data['trainQuestions'][i], max_num_utterance, max_seq_length)
            data['trainAnswers'][i] = data['trainAnswers'][i][:DEC_MAX_LEN]
            train_pairs.append(
                (data['trainQuestions'][i], data['trainAnswers'][i]))
        for i in range(len(data['devQuestions'])):
            data['devQuestions'][i] = cut_utterances(data['devQuestions'][i],
                                                     max_num_utterance,
                                                     max_seq_length)
            data['devAnswers'][i] = data['devAnswers'][i][:DEC_MAX_LEN]
            dev_pairs.append((data['devQuestions'][i], data['devAnswers'][i]))
        t = open(model_dir + '/data', 'wb')
        pickle.dump((train_pairs, dev_pairs), t)
        t.close()
    print(train_pairs[0])
    print(train_pairs[1])
    print(dev_pairs[0])
    print(dev_pairs[1])
    print("========================dataset===========================")
    print('train: ', len(train_pairs))
    print('dev: ', len(dev_pairs))

    # Encoder and decoder share a single embedding
    embed = init_embedding(embed_size, vocab_word.n_words,
                           vocab_word.word2index)
    embedding = nn.Embedding(vocab_word.n_words, embed_size,
                             padding_idx=0).from_pretrained(embed,
                                                            freeze=False)
    encoder = EncoderRNN(embed_size,
                         vocab_word.n_words,
                         hidden_size,
                         bidirectional=bidirectional,
                         num_layers=num_layers,
                         dropout_p=dropout_p)
    attn_decoder = CopynetDecoderRNN(embed_size,
                                     hidden_size,
                                     vocab_word.n_words_for_decoder,
                                     vocab_word.n_words,
                                     num_layers=num_layers,
                                     dropout_p=dropout_p)

    embedding = torch.nn.DataParallel(embedding).to(device_cuda)
    encoder = torch.nn.DataParallel(encoder).to(device_cuda)
    attn_decoder = torch.nn.DataParallel(attn_decoder).to(device_cuda)

    if os.path.isfile(model_dir + '/embedding'):
        embedding.load_state_dict(torch.load(model_dir + '/embedding'))

    if os.path.isfile(model_dir + '/encoder') and os.path.isfile(model_dir +
                                                                 '/decoder'):
        encoder.load_state_dict(torch.load(model_dir + '/encoder'))
        attn_decoder.load_state_dict(torch.load(model_dir + '/decoder'))

    optimizer = optim.Adam([{
        "params": embedding.parameters()
    }, {
        "params": encoder.parameters()
    }, {
        "params": attn_decoder.parameters()
    }],
                           lr=lr,
                           amsgrad=True)

    if os.path.isfile(model_dir + '/optimizer'):
        optimizer.load_state_dict(torch.load(model_dir + '/optimizer'))

    trainIters(vocab_word,
               embedding,
               encoder,
               attn_decoder,
               optimizer,
               train_pairs,
               dev_pairs,
               max_length,
               n_epochs,
               lr,
               batch_size,
               infer_batch_size,
               print_every=1)
Example #17
    nn_type = 'gru'
    #    nn_type = 'lstm'
    #    nn_type = 'conv'
    #    pooling_type = 'mean'
    #    pooling_type = 'max'
    pooling_type = 'attention'

    # Load Word2Vec
    if w2v_fn is None:
        pre_embedding = None
    else:
        print('Loading word2vec model...')
        if not os.path.exists(w2v_fn):
            raise Exception('Word2Vec model does NOT exist!', w2v_fn)
        gensim_w2v = Word2Vec.load(w2v_fn)
        pre_embedding = init_embedding(gensim_w2v, corpus.current_dic)

    classifier = construct_classifier(corpus.current_dic.size,
                                      n_emb,
                                      n_hidden,
                                      corpus.n_target,
                                      pre_embedding=pre_embedding,
                                      use_hie=use_hie,
                                      nn_type=nn_type,
                                      pooling_type=pooling_type)
    #    classifier.load_state_dict(torch.load('model-res/model-gru-attention-False-Adadelta-1.0000-v1.ckpt'))
    classifier.to(device)

    # Loss and Optimizer
    loss_func = nn.NLLLoss()
    # It seems that Adadelta is better than Adagrad and Adam...
Example #18
File: eni.py Project: ormn96/DRNE
    def build_model(self):
        with tf.variable_scope('Placeholder'):
            self.nodes_placeholder = tf.placeholder(tf.int32, (None, ),
                                                    name='nodes_placeholder')
            self.seqlen_placeholder = tf.placeholder(tf.int32, (None, ),
                                                     name='seqlen_placeholder')
            self.neighborhood_placeholder = tf.placeholder(
                tf.int32, (None, self.args.sampling_size),
                name='neighborhood_placeholder')
            self.label_placeholder = tf.placeholder(tf.float32, (None, ),
                                                    name='label_placeholder')

        self.data = network.next_batch(self.graph,
                                       self.degree_max,
                                       sampling=True,
                                       sampling_size=self.args.sampling_size)

        with tf.variable_scope('Embeddings'):
            self.embeddings = tf.get_variable(
                'embeddings', [len(self.graph), self.args.embedding_size],
                initializer=tf.constant_initializer(
                    utils.init_embedding(self.degree, self.degree_max,
                                         self.args.embedding_size)))

        with tf.variable_scope('LSTM'):
            cell = tf.contrib.rnn.DropoutWrapper(
                #tf.contrib.rnn.BasicLSTMCell(num_units=self.args.embedding_size),
                tf.contrib.rnn.LayerNormBasicLSTMCell(
                    num_units=self.args.embedding_size, layer_norm=False),
                input_keep_prob=1.0,
                output_keep_prob=1.0)
            _, states = tf.nn.dynamic_rnn(
                cell,
                tf.nn.embedding_lookup(self.embeddings,
                                       self.neighborhood_placeholder),
                dtype=tf.float32,
                sequence_length=self.seqlen_placeholder)
            self.lstm_output = states.h

        with tf.variable_scope('Guilded'):
            self.predict_info = tf.squeeze(
                tf.layers.dense(self.lstm_output,
                                units=1,
                                activation=utils.selu))

        with tf.variable_scope('Loss'):
            self.structure_loss = tf.losses.mean_squared_error(
                tf.nn.embedding_lookup(self.embeddings,
                                       self.nodes_placeholder),
                self.lstm_output)
            self.guilded_loss = tf.reduce_mean(
                tf.abs(tf.subtract(self.predict_info, self.label_placeholder)))
            self.orth_loss = tf.losses.mean_squared_error(
                tf.matmul(self.embeddings, self.embeddings, transpose_a=True),
                tf.eye(self.args.embedding_size))
            self.total_loss = self.structure_loss + self.args.alpha * self.orth_loss + self.args.lamb * self.guilded_loss

        with tf.variable_scope('Optimizer'):
            #self.optimizer = tf.train.AdamOptimizer(self.args.learning_rate)
            self.optimizer = tf.train.RMSPropOptimizer(self.args.learning_rate)
            tvars = tf.trainable_variables()
            grads, self.global_norm = tf.clip_by_global_norm(
                tf.gradients(self.total_loss, tvars), self.args.grad_clip)
            self.train_op = self.optimizer.apply_gradients(zip(grads, tvars))

        with tf.variable_scope('Summary'):
            tf.summary.scalar("orth_loss", self.orth_loss)
            tf.summary.scalar("guilded_loss", self.guilded_loss)
            tf.summary.scalar("structure_loss", self.structure_loss)
            tf.summary.scalar("total_loss", self.total_loss)
            tf.summary.scalar("globol_norm", self.global_norm)
            for (grad, var) in zip(grads, tvars):
                if grad is not None:
                    tf.summary.histogram('grad/{}'.format(var.name), grad)
                    tf.summary.histogram('weight/{}'.format(var.name), var)

            log_dir = os.path.join(self.save_path, 'logs')
            if os.path.exists(log_dir):
                shutil.rmtree(log_dir)
            self.summary_writer = tf.summary.FileWriter(
                log_dir, self.sess.graph)

            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = self.embeddings.name
            embedding.metadata_path = os.path.join(
                os.path.join(self.args.save_path, 'data', 'index.tsv'))
            projector.visualize_embeddings(self.summary_writer, config)

            self.merged_summary = tf.summary.merge_all()

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
Example #19
    def rand_init_embedding(self):
        """
        random initialize char-level embedding
        """
        utils.init_embedding(self.char_embeds.weight)
Example #20
    )

    args = parser.parse_args()

    if args.account is None:
        print("Missing account, it must be provided.")
        sys.exit(1)
    else:
        # TODO: Check if the account is valid; should we create a collection to store all accounts?
        pass

    t0 = time.time()

    input_segment_file = INPUT_SEGMENT_FILE.format(name=args.account)
    word_voc_file = WORD_VOC_FILE.format(name=args.account)
    tag_voc_file = TAG_VOC_FILE.format(name=args.account)
    label_voc_file = LABEL_VOC_FILE.format(name=args.account)
    init_vocabulary(args.account, input_segment_file, word_voc_file,
                    tag_voc_file, label_voc_file, args.regen)

    embedding_root = EMBEDDING_PATH
    word2vec_file = WORD2VEC_FILE.format(name=args.account)
    word_embedding_file = WORD_EMBEDDING_FILE.format(name=args.account)
    tag_embedding_file = TAG_EMBEDDING_FILE.format(name=args.account)
    init_embedding(embedding_root, word2vec_file, word_embedding_file,
                   tag_embedding_file, word_voc_file, tag_voc_file, args.regen)

    demo(word_embedding_file)

    print('Done in %.1fs!' % (time.time() - t0))
Example #21
def _train(args, pretrain_args):
    """Train the language model.

    Creates train/valid/test models, runs training epochs, saves model and
    writes results to database if specified.
    """
    start_time = time.time()
    print('Training', ', '.join(args.speakers), '...')

    # randomly sample validation set monte_carlo_cv_num times
    for num in range(args.monte_carlo_cv_num):
        # get seed used to sub-sample validation dataset (use 42 for 1st run)
        seed = utils.get_seed(num)

        # get train/valid/test data and convert to sequences
        train_data, valid_data, test_data, id_to_word = data_reader.get_data(
            args, seed=seed)
        # set configurations/hyperparameters for model
        config, test_config = utils.set_config(args, id_to_word)

        # initialize word embeddings
        init_embed = utils.init_embedding(id_to_word,
                                          dim=args.embed_size,
                                          init_scale=args.init_scale,
                                          embed_path=args.embed_path)

        with tf.Graph().as_default():
            # initializer used to initialize TensorFlow variables
            initializer = tf.random_uniform_initializer(
                -config['init_scale'], config['init_scale'])
            # create Train model
            with tf.name_scope('Train'):
                with tf.variable_scope('Model',
                                       reuse=None,
                                       initializer=initializer):
                    m_train = model.Model(args,
                                          is_training=True,
                                          config=config,
                                          init_embed=init_embed,
                                          name='Train')
                    m_train.build_graph()

            # create Valid model
            with tf.name_scope('Valid'):
                with tf.variable_scope('Model',
                                       reuse=True,
                                       initializer=initializer):
                    m_valid = model.Model(args,
                                          is_training=False,
                                          config=config,
                                          init_embed=init_embed,
                                          name='Valid')
                    m_valid.build_graph()

            # create Test model
            with tf.name_scope('Test'):
                with tf.variable_scope('Model',
                                       reuse=True,
                                       initializer=initializer):
                    m_test = model.Model(args,
                                         is_training=False,
                                         config=test_config,
                                         init_embed=init_embed,
                                         name='Test')
                    m_test.build_graph()

            # create summaries to be viewed in TensorBoard
            tb_summaries = utils.TensorBoardSummaries()
            tb_summaries.create_ops()

            init = tf.global_variables_initializer()

            # if pretrained, must create dict to initialize TF Saver
            if bool(pretrain_args):
                # get trainable variables and convert to dict for Saver
                reuse_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES)
                reuse_vars_dict = dict([(var.op.name, var)
                                        for var in reuse_vars])
                # create saver for TF session (see function for addl details)
                saver = utils.create_tf_saver(args, pretrain_args,
                                              reuse_vars_dict)
            else:
                saver = tf.train.Saver()

            # ppls dict has perplexities that are stored in results database
            ppls = {}
            ppls, _ = _update_ppls(ppls, initialize=True)

            with tf.Session() as sess:
                sess.run(init)

                if args.load_path != '':
                    print('Restoring model...')
                    saver.restore(sess, args.load_path)

                for epoch in range(config['max_epoch']):
                    print('Epoch: {0} Learning rate: {1:.3f}\n'.format(
                        epoch + 1, sess.run(m_train.lr)))
                    for i, speaker in enumerate(args.speakers):
                        print('Training {0} ...'.format(speaker))

                        # run epoch on training data
                        train_perplexity = _run_epoch(
                            sess,
                            m_train,
                            args,
                            train_data,
                            i,
                            tb_summaries,
                            id_to_word,
                            train_op=m_train.train_op,
                            verbose=True)
                        print('Epoch: {0} Train Perplexity: {1:.3f}'.format(
                            epoch + 1, train_perplexity))
                        ppls, _ = _update_ppls(ppls,
                                               epoch=epoch + 1,
                                               speaker=speaker,
                                               ppl=train_perplexity,
                                               dataset='train')

                        print('Validating...')
                        # run epoch on validation data
                        valid_perplexity = _run_epoch(sess,
                                                      m_valid,
                                                      args,
                                                      valid_data,
                                                      i,
                                                      tb_summaries,
                                                      id_to_word,
                                                      verbose=True)
                        print('Epoch: {0} Valid Perplexity: {1:.3f}'.format(
                            epoch + 1, valid_perplexity))
                        ppls, improved = _update_ppls(ppls,
                                                      epoch=epoch + 1,
                                                      speaker=speaker,
                                                      ppl=valid_perplexity,
                                                      dataset='valid')

                        if improved:
                            # save model if valid ppl is lower than current
                            # best valid ppl
                            if args.save_path != '':
                                print('Saving model to {0}.'.format(
                                    args.save_path))
                                saver.save(sess, args.save_path)

                for i, speaker in enumerate(args.speakers):
                    print('Testing {0} ...'.format(speaker))
                    print('Restoring best model for testing...')
                    saver.restore(sess, args.save_path)
                    # run model on test data
                    test_perplexity = _run_epoch(sess, m_test, args, test_data,
                                                 i)
                    ppls['test_ppl_' + speaker] = test_perplexity
                    print('Test Perplexity: {0:.3f}'.format(test_perplexity))

            if args.insert_db == 'True':
                # write params/config/results to sql database
                results_db.insert_results(args, config, start_time, ppls)