Example #1
# Imports inferred from the calls below; OUTPUT_PATH is assumed to be a
# pathlib.Path defined elsewhere in the module.
import numpy as np
from keras import optimizers
from keras.callbacks import CSVLogger, ModelCheckpoint
from seq2seq_utils import (build_seq2seq_model, load_decoder_inputs,
                           load_encoder_inputs, load_text_processor)


def train():
    encoder_input_data, encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
    s_encoder_input_data, s_encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_seq_vecs_v2.npy')
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')
    num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_code_proc_v2.dpkl')
    s_num_encoder_tokens, s_enc_pp = load_text_processor(OUTPUT_PATH /
                                                         'py_seq_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_comment_proc_v2.dpkl')

    seq2seq_Model = build_seq2seq_model(
        word_emb_dim=128,
        hidden_state_dim=128,
        encoder_seq_len=encoder_seq_len,
        s_encoder_seq_len=s_encoder_seq_len,
        num_encoder_tokens=num_encoder_tokens,
        num_s_encoder_tokens=s_num_encoder_tokens,
        num_decoder_tokens=num_decoder_tokens)

    seq2seq_Model.summary()
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    model_checkpoint = ModelCheckpoint(
        '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(
            script_name_base),
        save_best_only=True)

    batch_size = 100
    epochs = 50
    history = seq2seq_Model.fit(
        [encoder_input_data, s_encoder_input_data, decoder_input_data],
        np.expand_dims(decoder_target_data, -1),
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.12,
        callbacks=[csv_logger, model_checkpoint])
    seq2seq_Model.save("seqmodel.hdf5")
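All of the examples on this page call the same seq2seq_utils helpers. Their implementations are not shown here, but a minimal sketch consistent with how they are called above might look like this (the teacher-forcing shift, the ktext-style id2token vocabulary, and the use of dill, suggested by the .dpkl extension, are assumptions, not confirmed by these snippets):

import dill as dpickle
import numpy as np

def load_encoder_inputs(path):
    # The full padded/truncated sequence is fed to the encoder.
    vecs = np.load(path)
    return vecs, vecs.shape[1]

def load_decoder_inputs(path):
    vecs = np.load(path)
    # Assumed teacher forcing: the decoder input is the sequence shifted
    # one step right relative to the targets it must predict.
    return vecs[:, :-1], vecs[:, 1:]

def load_text_processor(path):
    with open(path, 'rb') as f:
        pp = dpickle.load(f)
    # Assumes a ktext-style processor exposing an id2token dict.
    num_tokens = max(pp.id2token.keys()) + 1
    return num_tokens, pp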
Example #2
parser.add_argument("--train_epochs",
                    type=int,
                    default=get_value_as_int('TRAIN_EPOCHS', 7))
parser.add_argument("--batch_size",
                    type=int,
                    default=get_value_as_int('BATCH_SIZE', 1200))
parser.add_argument("--validation_split",
                    type=float,
                    default=get_value_as_float('VALIDATION_SPLIT', 0.12))
args = parser.parse_args()
print(args)

learning_rate = float(args.learning_rate)

encoder_input_data, doc_length = load_encoder_inputs(
    args.input_train_body_vecs_npy)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    args.input_train_title_vecs_npy)

num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)

# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

###############
# Encoder Model.
###############
encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

# Word embedding for the encoder (e.g. the issue body)
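The snippet is truncated here; based on the fuller encoder in Example #3 below, it typically continues with an embedding layer, batch normalization, and a GRU whose final hidden state summarizes the whole input:

# Continuation sketch mirroring Example #3 (layer names taken from there)
x = Embedding(num_encoder_tokens, latent_dim,
              name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)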
Example #3
  def build_model(self, learning_rate):
    """Build a keras model."""
    logging.info("starting")

    if self.job_name and self.job_name.lower() in ["ps"]:
      logging.info("ps doesn't build model")
      return

    self.encoder_input_data, doc_length = load_encoder_inputs(
      self.preprocessed_bodies)
    self.decoder_input_data, self.decoder_target_data = load_decoder_inputs(
      self.preprocessed_titles)

    num_encoder_tokens, self.body_pp = load_text_processor(
      self.body_pp_file)
    num_decoder_tokens, self.title_pp = load_text_processor(
      self.title_pp_file)

    # Arbitrarily set the latent dimension for the embedding and hidden units
    latent_dim = 300

    ##### Define Model Architecture ######

    ########################
    #### Encoder Model ####
    encoder_inputs = keras.layers.Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for the encoder (e.g. the issue body)
    x = keras.layers.Embedding(
      num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
    x = keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    #  encode without decoding if we want to.

    encoder_model = keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)
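    # For example, encoder_model.predict(encoder_input_data) would yield one
    # latent_dim-sized vector per input sequence, usable as a standalone
    # document embedding without running the decoder.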

    ########################
    #### Decoder Model ####
    decoder_inputs = keras.layers.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = keras.layers.Embedding(
      num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding',
      mask_zero=False)(decoder_inputs)
    dec_bn = keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # TODO(https://github.com/kubeflow/examples/issues/196):
    # With tf.estimator we hit https://github.com/keras-team/keras/issues/9761
    # and the model won't train.
    decoder_gru = keras.layers.GRU(
      latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')

    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=[seq2seq_encoder_out])
    x = keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = keras.layers.Dense(
      num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model ####

    self.seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    self.seq2seq_Model.compile(
      optimizer=keras.optimizers.Nadam(lr=learning_rate),
      loss='sparse_categorical_crossentropy',)
      #  TODO(jlewi): Computing accuracy causes a dimension mismatch.
      # tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [869] vs. [79,11] # pylint: disable=line-too-long
      # [[{{node metrics/acc/Equal}} = Equal[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](metrics/acc/Reshape, metrics/acc/Cast)]]  # pylint: disable=line-too-long
      # metrics=['accuracy'])

    self.seq2seq_Model.summary()
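Example #3 stops after compiling; a training call consistent with the attributes set above might mirror the fit in Example #1. In this sketch, `trainer` is a hypothetical instance of the (unnamed) class this method belongs to, and the batch size, epoch count, and validation split are Example #2's defaults, not values from this example:

import numpy as np

history = trainer.seq2seq_Model.fit(
    [trainer.encoder_input_data, trainer.decoder_input_data],
    np.expand_dims(trainer.decoder_target_data, -1),  # targets need a trailing axis
    batch_size=1200,
    epochs=7,
    validation_split=0.12)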
Example #4
    with open(data_dir + 'title_pp.dpkl', 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data
    np.save(data_dir + 'train_title_vecs.npy', train_title_vecs)
    np.save(data_dir + 'train_body_vecs.npy', train_body_vecs)
else:
    time.sleep(120)

while True:
    if os.path.isfile(data_dir + 'train_body_vecs.npy'):
        break
    print("Waiting for dataset")
    time.sleep(2)
encoder_input_data, doc_length = load_encoder_inputs(data_dir +
                                                     'train_body_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(
    data_dir + 'train_title_vecs.npy')

num_encoder_tokens, body_pp = load_text_processor(data_dir + 'body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor(data_dir + 'title_pp.dpkl')

# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = tf.keras.layers.Input(shape=(doc_length, ),
                                       name='Encoder-Input')

# Word embedding for the encoder (e.g. the issue body)
Example #5
body_pkl_file = args.output_dir + '/body_pp.dpkl'
title_pkl_file = args.output_dir + '/title_pp.dpkl'
train_body_vecs_file = args.output_dir + '/train_body_vecs.npy'
train_title_vecs_file = args.output_dir + '/train_title_vecs.npy'

# Save the preprocessor
with open(body_pkl_file, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(title_pkl_file, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data
np.save(train_title_vecs_file, train_title_vecs)
np.save(train_body_vecs_file, train_body_vecs)

encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs_file)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    train_title_vecs_file)

num_encoder_tokens, body_pp = load_text_processor(body_pkl_file)
num_decoder_tokens, title_pp = load_text_processor(title_pkl_file)

# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

# Word embedding for the encoder (e.g. the issue body)
x = Embedding(num_encoder_tokens,
              latent_dim,
              name='Body-Word-Embedding',
              mask_zero=False)(encoder_inputs)
Example #6
if not use_cache:
    # Save the processed data
    np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code)
    np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment)


# Arrange data for modeling

# In[5]:

from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'py_comment_proc_v2.dpkl')


# If you don't have the above files on disk (because you set `use_cache = True`), you can download the files used by the above function calls here:
# 
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_code_vecs_v2.npy
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_comment_vecs_v2.npy
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl
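
# A short download loop covering just those four files might look like this
# (urllib is an assumption; OUTPUT_PATH is the same path object used above):

import urllib.request

BASE_URL = ('https://storage.googleapis.com/kubeflow-examples/'
            'code_search/data/seq2seq/')
for fname in ['py_t_code_vecs_v2.npy', 'py_t_comment_vecs_v2.npy',
              'py_code_proc_v2.dpkl', 'py_comment_proc_v2.dpkl']:
    dest = OUTPUT_PATH / fname
    if not dest.exists():
        urllib.request.urlretrieve(BASE_URL + fname, str(dest))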

# # Build Seq2Seq Model For Summarizing Code
# 
# We will build a model to predict the docstring given a function or a method. While this is a very cool task in itself, it is not the end goal of this exercise. The motivation for training this model is to learn a general-purpose feature extractor for code that we can use for the task of code search.
Example #7
import numpy as np
if not use_cache:
    # Save the preprocessor
    with open(OUTPUT_PATH / 'py_code_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(code_proc, f)
    with open(OUTPUT_PATH / 'py_comment_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(comment_proc, f)
    # Save the processed data
    np.save(OUTPUT_PATH / 'py_t_code_vecs_v2.npy', t_code)
    np.save(OUTPUT_PATH / 'py_t_comment_vecs_v2.npy', t_comment)

# Arrange data for modeling
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor
encoder_input_data, encoder_seq_length = load_encoder_inputs(
    OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(
    OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH /
                                                 'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH /
                                                 'py_comment_proc_v2.dpkl')

# Build Seq2Seq Model for summarizing code
from seq2seq_utils import build_seq2seq_model
seq2seq_Model = build_seq2seq_model(word_emb_dim=800,
                                    hidden_state_dim=1000,
                                    encoder_seq_len=encoder_seq_length,
                                    num_encoder_tokens=num_encoder_tokens,
                                    num_decoder_tokens=num_decoder_tokens)
seq2seq_Model.summary()

# Train Seq2Seq Model
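
The listing is truncated at this point. A training step in the spirit of Example #1 might look like the following; the optimizer, loss, batch size, epoch count, and validation split are that example's values, not ones confirmed for this snippet:

from keras import optimizers  # import assumed, matching Example #1

seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                      loss='sparse_categorical_crossentropy')

history = seq2seq_Model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),  # targets need a trailing axis
    batch_size=100,
    epochs=50,
    validation_split=0.12)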