Python Decoding.beam_decoder 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: utils.decoding_att_fsoftmax

클래스/타입: Decoding

메소드/함수: beam_decoder

hotexamples.com에서의 예제들: 3

Python Decoding.beam_decoder - 3개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 utils.decoding_att_fsoftmax.Decoding.beam_decoder의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 각 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Decoding(3)

beam_decoder(3)

자주 사용되는 메소드들

Decoding (3)

beam_decoder (3)

예제 #1

파일 보기

파일: decoder_inspec_att_fsoftmax_v2.py 프로젝트: hongtaowutj/Seq2Seq-Keyphrase-Generation

def decoder(params):
    """Beam-decode keyphrases with an ``AttentionFullSoftmax`` model.

    All settings are read from ``params`` by key. The function loads the
    vocabulary lookups, the ground-truth keyphrase tokens and the padded
    encoder inputs of the train, validation and test splits, and the two
    embedding matrices, all through ``DataConnector``. It then builds the
    model objects, derives the beam width from keyphrase-count statistics,
    runs ``Decoding.beam_decoder`` on the first 500 encoder inputs and saves
    the output as ``beam_kp-<file_name>.npy`` under ``params['decode_path']``.

    Nothing is returned; progress is printed to stdout.
    """

    # Paths, file names and weights. Some entries (model_path, result_path)
    # are looked up but not used further down.
    (data_path, glove_embed, oov_embed, preprocessed_v2, preprocessed_data,
     decode_path, model_path, result_path, result_kp20k, file_name,
     weights) = [
         params[key] for key in (
             'data_path', 'glove_embedding', 'oov_embedding',
             'preprocessed_v2', 'preprocessed_data', 'decode_path',
             'model_path', 'result_path', 'result_kp20k', 'file_name',
             'weights')
     ]

    # Model hyper-parameters. batch_size and epoch are looked up but unused.
    (encoder_length, decoder_length, embedding_dim, birnn_dim, rnn_dim,
     vocab_size, batch_size, epoch) = [
         params[key] for key in (
             'encoder_length', 'decoder_length', 'embedding_dim',
             'birnn_dim', 'rnn_dim', 'vocab_size', 'batch_size', 'epoch')
     ]

    # Read one pickle file through DataConnector and hand back its content.
    def read_pickled(folder, name):
        conn = DataConnector(folder, name, data=None)
        conn.read_pickle()
        return conn.read_file

    # Read one .npy file through DataConnector and hand back its content.
    def read_array(folder, name):
        conn = DataConnector(folder, name, data=None)
        conn.read_numpys()
        return conn.read_file

    # Print a message and flush stdout right away.
    def report(message):
        print(message)
        sys.stdout.flush()

    # --- vocabulary dictionaries -----------------------------------------
    indices_words = read_pickled(preprocessed_v2,
                                 'all_idxword_vocabulary_fsoftmax.pkl')
    words_indices = read_pickled(preprocessed_v2,
                                 'all_wordidx_vocabulary_fsoftmax.pkl')

    # --- ground truth: train, validation and test merged into one set -----
    y_test_true = np.concatenate([
        read_array(data_path, name)
        for name in ('train_output_tokens.npy', 'val_output_tokens.npy',
                     'test_output_tokens.npy')
    ])
    report("Ground truth of keyphrases shape: %s" % str(y_test_true.shape))

    # --- encoder inputs (non-paired data set), merged the same way --------
    X_in = np.concatenate([
        read_array(preprocessed_data, name)
        for name in ('X_train_pad_fsoftmax.npy', 'X_valid_pad_fsoftmax.npy',
                     'X_test_pad_fsoftmax.npy')
    ])

    # --- embedding matrices -------------------------------------------------
    pretrained_embedding = read_pickled(preprocessed_v2, glove_embed)
    print("pretrained_embedding shape: %s" % str(pretrained_embedding.shape))
    print("pretrained_embedding [0][:10]: %s" %
          str(pretrained_embedding[0, :10]))
    print("pretrained_embedding [1][:10]: %s" %
          str(pretrained_embedding[1, :10]))

    oov_embedding = read_pickled(preprocessed_v2, oov_embed)
    print("oov_embedding shape: %s" % str(oov_embedding.shape))
    print("oov_embedding [0][:10]: %s" % str(oov_embedding[0, :10]))
    print("oov_embedding [1][:10]: %s" % str(oov_embedding[1, :10]))
    print("oov_embedding [2][:10]: %s" % str(oov_embedding[2, :10]))

    # --- model construction -------------------------------------------------
    # The training-only arguments are passed as None.
    full_softmax = AttentionFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_kp20k,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)
    full_softmax.train_att_seq2seq(pretrained_embedding, oov_embedding)
    # NOTE(review): presumably loads `weights` into the prediction model --
    # confirm against AttentionFullSoftmax.
    full_softmax.predict_att_seq2seq(weights)
    encoder_model = full_softmax.encoder_model
    # Prediction model exposed by the wrapper; it is not used below.
    predict_softmax_model = full_softmax.prediction_model

    # Inference stage.
    # Model: layers from the prediction model and the decoder model.
    # Text generation approaches:
    #   1. One-best (greedy) search: return the single most probable word
    #      sequence, from the joint probability of words over the decoder
    #      time steps (decoder sequence length).
    #   2. N-beam search: return the N most probable word sequences, using a
    #      beam tree search per time step and the joint probability over the
    #      decoder time steps.
    # The decoder model below is the one used at inference time to generate
    # keyphrases.
    decoder_model = full_softmax.create_decoder_model()

    # Turn the tokenized ground truth into full keyphrases and collect the
    # per-document keyphrase-count statistics.
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    report("Maximum number of key phrases per document in corpus: %s" %
           max_kp_num)
    report("Average number of key phrases per document in corpus: %s" %
           mean_kp_num)
    report(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)

    # Round x up to the next multiple of 5 (multiples of 5 are left as is).
    def roundup(x):
        remainder = x % 5
        if remainder == 0:
            return x
        return x + 5 - remainder

    # Beam width: mean + 3 * std of keyphrases per document, rounded up.
    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    report("\nBeam width: %s\n" % beam_width)

    # Zero arrays sized for every input; only the first 500 rows are passed
    # to the beam decoder below.
    s0_test = np.zeros((len(X_in), rnn_dim))
    att0_test = np.zeros((len(X_in), encoder_length, 1))

    report(str(datetime.now()))

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    started = time.time()
    report("Start beam decoding...")

    beam_keyphrases = inference_mode.beam_decoder(X_in[:500], s0_test[:500],
                                                  att0_test[:500])

    DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name),
                  beam_keyphrases).save_numpys()

    # The elapsed time reported here also covers saving the result.
    finished = time.time()
    report("Beam decoding is done in %.3fsec" % (finished - started))

예제 #2

파일 보기

def decoder(params):
    """Beam-decode keyphrases for the test split with ``AttentionFullSoftmax``.

    All settings are read from ``params`` by key. The function loads the
    vocabulary lookups, the ground-truth test keyphrase tokens, and the paired
    and non-paired test arrays through ``DataConnector``. It then builds the
    model objects, derives the beam width from keyphrase-count statistics,
    runs ``Decoding.beam_decoder`` on the first 500 rows of ``X_test`` and
    saves the output as ``beam_kp-<file_name>.npy`` under
    ``params['decode_path']``.

    Nothing is returned; progress is printed to stdout.
    """

    # Paths, file name and weights. model_path is looked up but unused.
    (data_path, preprocessed_v2, preprocessed_data, decode_path, model_path,
     result_path, file_name, weights) = [
         params[key] for key in (
             'data_path', 'preprocessed_v2', 'preprocessed_data',
             'decode_path', 'model_path', 'result_path', 'file_name',
             'weights')
     ]

    # Model hyper-parameters.
    (encoder_length, decoder_length, embedding_dim, birnn_dim, rnn_dim,
     vocab_size) = [
         params[key] for key in (
             'encoder_length', 'decoder_length', 'embedding_dim',
             'birnn_dim', 'rnn_dim', 'vocab_size')
     ]

    # Read one pickle file through DataConnector and hand back its content.
    def read_pickled(folder, name):
        conn = DataConnector(folder, name, data=None)
        conn.read_pickle()
        return conn.read_file

    # Read one .npy file through DataConnector and hand back its content.
    def read_array(folder, name):
        conn = DataConnector(folder, name, data=None)
        conn.read_numpys()
        return conn.read_file

    # Print a message and flush stdout right away.
    def report(message):
        print(message)
        sys.stdout.flush()

    # --- vocabulary dictionaries -----------------------------------------
    indices_words = read_pickled(preprocessed_v2,
                                 'all_indices_words_fsoftmax.pkl')
    words_indices = read_pickled(preprocessed_v2,
                                 'all_words_indices_fsoftmax.pkl')

    # --- ground-truth keyphrase tokens of the test split -------------------
    y_test_true = read_array(data_path, 'test_output_tokens.npy')

    # --- paired data set (loaded, but not used further down) ---------------
    X_pair_test = read_array(preprocessed_data, 'x_pair_test_fsoftmax.npy')
    y_pair_test_in = read_array(preprocessed_data,
                                'y_pair_test_in_fsoftmax.npy')
    y_pair_test_out = read_array(preprocessed_data,
                                 'y_pair_test_out_fsoftmax.npy')

    # --- non-paired data set ------------------------------------------------
    X_test = read_array(preprocessed_data, 'X_test_pad_fsoftmax.npy')
    y_test_in = read_array(preprocessed_data, 'y_test_in_fsoftmax.npy')
    y_test_out = read_array(preprocessed_data, 'y_test_out_fsoftmax.npy')

    report("\n Non-paired test set: \n")
    report("X (input for encoder) shape: %s" % str(X_test.shape))
    report("y_in (input for decoder) shape: %s" % str(y_test_in.shape))
    report("y_out (output for decoder) shape: %s\n\n" %
           str(y_test_out.shape))

    # --- model construction -------------------------------------------------
    # The training-only arguments are passed as None.
    full_softmax = AttentionFullSoftmax(encoder_length=encoder_length,
                                        decoder_length=decoder_length,
                                        embedding_dim=embedding_dim,
                                        birnn_dim=birnn_dim,
                                        rnn_dim=rnn_dim,
                                        vocab_size=vocab_size,
                                        filepath=result_path,
                                        filename=file_name,
                                        batch_train_iter=None,
                                        batch_val_iter=None,
                                        batch_size=None,
                                        steps_epoch=None,
                                        val_steps=None,
                                        epochs=None)

    # Skeleton of the model architecture.
    full_softmax.train_att_seq2seq()

    # NOTE(review): presumably loads `weights` into the prediction model --
    # confirm against AttentionFullSoftmax. The returned value is not used
    # below.
    predict_softmax_model = full_softmax.predict_att_seq2seq(weights)
    encoder_model = full_softmax.encoder_model

    # Inference stage.
    # Model: layers from the prediction model and the decoder model.
    # Text generation approaches:
    #   1. One-best (greedy) search: return the single most probable word
    #      sequence, from the joint probability of words over the decoder
    #      time steps (decoder sequence length).
    #   2. N-beam search: return the N most probable word sequences, using a
    #      beam tree search per time step and the joint probability over the
    #      decoder time steps.
    decoder_model = full_softmax.create_decoder_model()

    # Turn the tokenized ground truth into full keyphrases and collect the
    # per-document keyphrase-count statistics.
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    report("Maximum number of key phrases per document in corpus: %s" %
           max_kp_num)
    report("Average number of key phrases per document in corpus: %s" %
           mean_kp_num)
    report(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)

    # Round x up to the next multiple of 5 (multiples of 5 are left as is).
    def roundup(x):
        remainder = x % 5
        if remainder == 0:
            return x
        return x + 5 - remainder

    # Beam width: mean + 3 * std of keyphrases per document, rounded up.
    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    report("\nBeam width: %s\n" % beam_width)
    num_hypotheses = beam_width

    # Zero arrays sized for every test input; only the first 500 rows are
    # passed to the beam decoder below.
    s0_test = np.zeros((len(X_test), rnn_dim))
    att0_test = np.zeros((len(X_test), encoder_length, 1))

    report(str(datetime.now()))

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    started = time.time()
    report("Start beam decoding...")

    beam_keyphrases = inference_mode.beam_decoder(X_test[:500], s0_test[:500],
                                                  att0_test[:500])

    DataConnector(decode_path, 'beam_kp-%s.npy' % (file_name),
                  beam_keyphrases).save_numpys()

    # The elapsed time reported here also covers saving the result.
    finished = time.time()
    report("Beam decoding is done in %.3fsec" % (finished - started))

예제 #3

파일 보기

def decoder(params):
    """Beam-decode keyphrases with a ``HierarchyAttFullSoftmax`` model.

    All settings are read from ``params`` by key. The function loads the
    sentence-level vocabulary lookups, the ground-truth keyphrase tokens and
    the padded encoder inputs of the train, validation and test splits, all
    through ``DataConnector``. It then builds the model objects, derives the
    beam width from keyphrase-count statistics, runs ``Decoding.beam_decoder``
    on the first 500 encoder inputs and saves the output as
    ``beam_kp-hier-<file_name>.npy`` under ``params['decode_path']``.

    Nothing is returned; progress is printed to stdout.
    """

    # Paths, file name and weights. Some entries (kp20k_path, model_path,
    # result_path) are looked up but not used further down.
    (data_path, kp20k_path, preprocessed_v2, preprocessed_data, decode_path,
     model_path, result_path, result_kp20k, file_name, weights) = [
         params[key] for key in (
             'data_path', 'kp20k_path', 'preprocessed_v2',
             'preprocessed_data', 'decode_path', 'model_path', 'result_path',
             'result_kp20k', 'file_name', 'weights')
     ]

    # Model hyper-parameters. batch_size and epoch are looked up but unused.
    (encoder_length, decoder_length, max_sents, embedding_dim, birnn_dim,
     rnn_dim, vocab_size, batch_size, epoch) = [
         params[key] for key in (
             'encoder_length', 'decoder_length', 'max_sents', 'embedding_dim',
             'birnn_dim', 'rnn_dim', 'vocab_size', 'batch_size', 'epoch')
     ]

    # Read one pickle file through DataConnector and hand back its content.
    def read_pickled(folder, name):
        conn = DataConnector(folder, name, data=None)
        conn.read_pickle()
        return conn.read_file

    # Read one .npy file through DataConnector and hand back its content.
    def read_array(folder, name):
        conn = DataConnector(folder, name, data=None)
        conn.read_numpys()
        return conn.read_file

    # Print a message and flush stdout right away.
    def report(message):
        print(message)
        sys.stdout.flush()

    # --- vocabulary dictionaries -----------------------------------------
    indices_words = read_pickled(preprocessed_v2,
                                 'all_indices_words_sent_fsoftmax.pkl')
    words_indices = read_pickled(preprocessed_v2,
                                 'all_words_indices_sent_fsoftmax.pkl')

    # --- ground truth: train, validation and test merged into one set -----
    y_test_true = np.concatenate([
        read_array(data_path, name)
        for name in ('train_output_sent_tokens.npy',
                     'val_output_sent_tokens.npy',
                     'test_output_sent_tokens.npy')
    ])
    report("Ground truth of keyphrases shape: %s" % str(y_test_true.shape))

    # --- encoder inputs (non-paired data set), merged the same way --------
    X_in = np.concatenate([
        read_array(preprocessed_data, name)
        for name in ('X_train_pad_sent_fsoftmax.npy',
                     'X_valid_pad_sent_fsoftmax.npy',
                     'X_test_pad_sent_fsoftmax.npy')
    ])

    report("\n Non-paired test set: \n")
    report("X (input for encoder) shape: %s" % str(X_in.shape))

    # --- model construction -------------------------------------------------
    # The training-only arguments are passed as None.
    full_softmax = HierarchyAttFullSoftmax(encoder_length=encoder_length,
                                           decoder_length=decoder_length,
                                           max_sents=max_sents,
                                           embedding_dim=embedding_dim,
                                           birnn_dim=birnn_dim,
                                           rnn_dim=rnn_dim,
                                           vocab_size=vocab_size,
                                           filepath=result_kp20k,
                                           filename=file_name,
                                           batch_train_iter=None,
                                           batch_val_iter=None,
                                           batch_size=None,
                                           steps_epoch=None,
                                           val_steps=None,
                                           epochs=None)

    # Skeleton of the model architecture.
    full_softmax.train_hier_att_seq2seq()

    # NOTE(review): presumably loads `weights` into the prediction model --
    # confirm against HierarchyAttFullSoftmax.
    full_softmax.predict_hier_att(weights)
    encoder_model = full_softmax.encoder_model
    # Prediction model exposed by the wrapper; it is not used below.
    predict_softmax_model = full_softmax.prediction_model

    # Decoder model used at inference time to generate keyphrases.
    decoder_model = full_softmax.create_decoder_model()

    # Turn the tokenized ground truth into full keyphrases and collect the
    # per-document keyphrase-count statistics.
    keyphrases_transform = TrueKeyphrases(y_test_true)
    keyphrases_transform.get_true_keyphrases()
    keyphrases_transform.get_stat_keyphrases()
    y_true = keyphrases_transform.y_true
    max_kp_num = keyphrases_transform.max_kp_num
    mean_kp_num = keyphrases_transform.mean_kp_num
    std_kp_num = keyphrases_transform.std_kp_num

    report("Maximum number of key phrases per document in corpus: %s" %
           max_kp_num)
    report("Average number of key phrases per document in corpus: %s" %
           mean_kp_num)
    report(
        "Standard Deviation of number of key phrases per document in corpus: %s"
        % std_kp_num)

    # Round x up to the next multiple of 5 (multiples of 5 are left as is).
    def roundup(x):
        remainder = x % 5
        if remainder == 0:
            return x
        return x + 5 - remainder

    # Beam width: mean + 3 * std of keyphrases per document, rounded up.
    beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
    num_hypotheses = beam_width
    report("\nBeam width: %s\n" % beam_width)

    # Zero arrays sized for every input; only the first 500 rows are passed
    # to the beam decoder below.
    s0_test = np.zeros((len(X_in), rnn_dim))
    att0_test = np.zeros((len(X_in), encoder_length, 1))

    report(str(datetime.now()))

    inference_mode = Decoding(encoder_model=encoder_model,
                              decoder_model=decoder_model,
                              indices_words=indices_words,
                              words_indices=words_indices,
                              enc_in_seq=None,
                              states=None,
                              attentions=None,
                              decoder_length=decoder_length,
                              rnn_dim=rnn_dim,
                              beam_width=beam_width,
                              num_hypotheses=num_hypotheses,
                              filepath=decode_path,
                              filename=file_name)

    started = time.time()
    report("Start beam decoding...")

    beam_keyphrases = inference_mode.beam_decoder(X_in[:500], s0_test[:500],
                                                  att0_test[:500])

    DataConnector(decode_path, 'beam_kp-hier-%s.npy' % (file_name),
                  beam_keyphrases).save_numpys()

    # The elapsed time reported here also covers saving the result.
    finished = time.time()
    report("Beam decoding is done in %.3fsec" % (finished - started))