Example #1
def predict():
    train_code, holdout_code, train_comment, holdout_comment = read_training_files(
        '../../data/processed_data/')
    loc = "/home/bohong/文档/mygit/cdpensearch/cdpensearch/oneEncoder/seqmodel.hdf5"
    seq2seq_Model = load_model(loc)

    loc = OUTPUT_PATH / 'py_code_proc_v2.dpkl'
    num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_code_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_comment_proc_v2.dpkl')
    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                    decoder_preprocessor=dec_pp,
                                    seq2seq_model=seq2seq_Model)
    demo_testdf = pd.DataFrame({
        'code': holdout_code,
        'comment': holdout_comment,
        'ref': ''
    })
    # seq2seq_inf.predications(df=demo_testdf)
    with open("generatetag.txt") as f:
        score = seq2seq_inf.evaluate_model(f.readlines(),
                                           holdout_comment,
                                           max_len=None)
    print(score)
    def load_seq2seq_model(self):
        K.clear_session()
        seq2seq_Model = load_model(
            str(self.seq2seq_path / 'code_summary_seq2seq_model.h5'))
        num_encoder_tokens, enc_pp = load_text_processor(
            self.seq2seq_path / 'py_code_proc_v2.dpkl')
        num_decoder_tokens, dec_pp = load_text_processor(
            self.seq2seq_path / 'py_comment_proc_v2.dpkl')
        self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                             decoder_preprocessor=dec_pp,
                                             seq2seq_model=seq2seq_Model)
Example #3
def detect(inputs, input_model_h5, input_title_preprocessor_dpkl,
           input_body_preprocessor_dpkl):
    # Load model, preprocessors.
    seq2seq_Model = keras.models.load_model(input_model_h5)
    num_encoder_tokens, body_pp = load_text_processor(
        input_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        input_title_preprocessor_dpkl)

    # Prepare inference.
    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)

    # Generate a title prediction for the provided input.
    return seq2seq_inf.generate_issue_title(inputs[0])
    def create_vector(self, postgres, file_id):
        K.clear_session()
        print("Going to load code2emb_model")
        self.code2emb_model = load_model(str(self.code2emb_path /
                                             'code2emb_model.hdf5'),
                                         custom_objects=None,
                                         compile=False)
        print("Going to load_text_processor")
        self.num_encoder_tokens_vector, self.enc_pp_vector = load_text_processor(
            self.seq2seq_path / 'py_code_proc_v2.dpkl')
        # with open(self.data_path/'without_docstrings.function', 'r', encoding='utf-8') as f:
        #     no_docstring_funcs = f.readlines()
        paras, paraids, autotags, manualtags = postgres.get_paragraphs_fileid(
            file_id)
        paras = [str(item) for item in paras]
        no_docstring_funcs = paras
        print("no_docstring_funcs = ", no_docstring_funcs)
        print("Going to transform_parallel")
        # encinp = self.enc_pp_vector.transform_parallel(no_docstring_funcs)
        encinp = self.enc_pp_vector.transform(no_docstring_funcs)
        # np.save(self.code2emb_path/'nodoc_encinp.npy', encinp)
        # encinp = np.load(self.code2emb_path/'nodoc_encinp.npy')
        print("Going to create the vector")
        nodoc_vecs = self.code2emb_model.predict(encinp, batch_size=200)
        # make sure the number of output rows equals the number of input rows
        assert nodoc_vecs.shape[0] == encinp.shape[0]
        # np.save(self.code2emb_path/'nodoc_vecs.npy', nodoc_vecs)
        npy_filename = str(file_id) + "####" + "nodoc_vecs.npy"
        np.save(self.npy_path / npy_filename, nodoc_vecs)

        K.clear_session()
        print("Vector is created")
    def load_code2emb_model(self):
        K.clear_session()
        self.code2emb_model = load_model(str(self.code2emb_path /
                                             'code2emb_model.hdf5'),
                                         custom_objects=None,
                                         compile=False)
        self.num_encoder_tokens_vector, self.enc_pp_vector = load_text_processor(
            self.seq2seq_path / 'py_code_proc_v2.dpkl')
Example #6
def train():
    encoder_input_data, encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
    s_encoder_input_data, s_encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_seq_vecs_v2.npy')
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')
    num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_code_proc_v2.dpkl')
    s_num_encoder_tokens, s_enc_pp = load_text_processor(OUTPUT_PATH /
                                                         'py_seq_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_comment_proc_v2.dpkl')

    seq2seq_Model = build_seq2seq_model(
        word_emb_dim=128,
        hidden_state_dim=128,
        encoder_seq_len=encoder_seq_len,
        s_encoder_seq_len=s_encoder_seq_len,
        num_encoder_tokens=num_encoder_tokens,
        num_s_encoder_tokens=s_num_encoder_tokens,
        num_decoder_tokens=num_decoder_tokens)

    seq2seq_Model.summary()
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    model_checkpoint = ModelCheckpoint(
        '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(
            script_name_base),
        save_best_only=True)

    batch_size = 100
    epochs = 50
    history = seq2seq_Model.fit(
        [encoder_input_data, s_encoder_input_data, decoder_input_data],
        np.expand_dims(decoder_target_data, -1),
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.12,
        callbacks=[csv_logger, model_checkpoint])
    seq2seq_Model.save("seqmodel.hdf5")
Example #7
def load_summarizer(seq2seq_model_path, text_processor_path):
    """
    Loads the code summarizer model and returns the inference object
    to be used for predicting docstrings.

    Input:   seq2seq_model_path - directory containing the trained seq2seq model
             text_processor_path - directory containing the pickled text processors

    Returns: (Seq2Seq_Inference object, TensorFlow default graph)

    Author: Tyler Medlin

    """
    # The code from the GitHub team has a lot of soon-to-be-deprecated functions;
    # suppress the deprecation warnings.
    tf.logging.set_verbosity('ERROR')
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    warnings.filterwarnings("ignore")

    logging.warning('Loading pre-trained model...')
    # Load model
    seq2seq_Model = load_model(seq2seq_model_path +
                               '/py_func_sum_v9_.epoch16-val2.55276.hdf5')

    logging.warning('Loading text processor (encoder)...')
    # Load encoder (code) pre-processor
    num_encoder_tokens, enc_pp = load_text_processor(text_processor_path +
                                                     '/py_code_proc_v2.dpkl')

    logging.warning('Loading text processor (decoder)...')
    # Load decoder (docstrings/comments) pre-processor
    num_decoder_tokens, dec_pp = load_text_processor(
        text_processor_path + '/py_comment_proc_v2.dpkl')

    graph = tf.get_default_graph()

    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                    decoder_preprocessor=dec_pp,
                                    seq2seq_model=seq2seq_Model)

    return seq2seq_inf, graph
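
# A minimal usage sketch for load_summarizer; the directory paths and the demo
# DataFrame below are placeholders, and the prediction call mirrors the other
# examples in this listing.
import pandas as pd

# Hypothetical locations of the saved .hdf5 model and .dpkl text processors.
seq2seq_inf, graph = load_summarizer('./model_dir', './processor_dir')

# Run predictions against the graph the model was loaded into (TF1-style sessions).
with graph.as_default():
    demo_df = pd.DataFrame({'code': ['def add(a, b):\n    return a + b'],
                            'comment': '',
                            'ref': ''})
    seq2seq_inf.demo_model_predictions(n=1, df=demo_df)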
    def create_autotag(self, postgres, file_id):
        K.clear_session()
        seq2seq_Model = load_model(
            str(self.seq2seq_path / 'code_summary_seq2seq_model.h5'))
        num_encoder_tokens, enc_pp = load_text_processor(
            self.seq2seq_path / 'py_code_proc_v2.dpkl')
        num_decoder_tokens, dec_pp = load_text_processor(
            self.seq2seq_path / 'py_comment_proc_v2.dpkl')
        self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                             decoder_preprocessor=dec_pp,
                                             seq2seq_model=seq2seq_Model)
        paras, paraids, autotags, manualtags = postgres.get_paragraphs_fileid(
            file_id)
        paras = [str(item) for item in paras]
        no_docstring_funcs = paras
        no_docstring_paraids = paraids
        print("no_docstring_paraids = ", no_docstring_paraids)
        print("size of paragraphs = ", len(no_docstring_funcs))
        print("size of paraids = ", len(no_docstring_paraids))
        demo_testdf = pd.DataFrame({
            'code': no_docstring_funcs,
            'comment': '',
            'ref': ''
        })
        auto_tag = self.seq2seq_inf.demo_model_predictions(n=15,
                                                           df=demo_testdf)
        print("size of auto_tag = ", len(auto_tag))
        with open(self.data_path / 'without_docstrings.autotag',
                  'w',
                  encoding='utf-8') as f:
            index = 0
            for item in auto_tag:
                f.write("%s\n" % item)
                paraid = no_docstring_paraids[index]
                # paraid = paraid.strip()
                updated_rows = postgres.update_autotag(paraid, item)
                index = index + 1
        K.clear_session()
    def load_models(self):
        K.clear_session()
        print("Going to load 'code_summary_seq2seq_model.h5'")
        seq2seq_Model = load_model(
            str(self.seq2seq_path / 'code_summary_seq2seq_model.h5'))
        print("Going to load 'py_code_proc_v2.dpkl'")
        num_encoder_tokens, enc_pp = load_text_processor(
            self.seq2seq_path / 'py_code_proc_v2.dpkl')
        print("Going to load 'py_comment_proc_v2.dpkl'")
        num_decoder_tokens, dec_pp = load_text_processor(
            self.seq2seq_path / 'py_comment_proc_v2.dpkl')
        print("Going to load 'Seq2Seq_Inference'")
        self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                             decoder_preprocessor=dec_pp,
                                             seq2seq_model=seq2seq_Model)
        print("Going to load 'code2emb_model.hdf5'")
        self.code2emb_model = load_model(str(self.code2emb_path /
                                             'code2emb_model.hdf5'),
                                         custom_objects=None,
                                         compile=False)
        print("Going to load 'py_code_proc_v2.dpkl'")
        self.num_encoder_tokens_vector, self.enc_pp_vector = load_text_processor(
            self.seq2seq_path / 'py_code_proc_v2.dpkl')
Example #10
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data_gcs_bucket", type=str, default="")
    parser.add_argument("--input_data_gcs_path", type=str, default="")

    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument("--output_model_gcs_path", type=str, default="")

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5",
                        type=str,
                        default="output_model.h5")

    args = parser.parse_args()
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)
    print("Download iput file")
    if args.input_data_gcs_bucket != "" and args.input_data_gcs_path != "":
        bucket = storage.Bucket(storage.Client(), args.input_data_gcs_bucket)
        storage.Blob(args.input_data_gcs_path,
                     bucket).download_to_filename('github-issues.zip')
    else:
        urllib.request.urlretrieve(
            "https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip",
            'github-issues.zip')

    print("unzip iput file")
    zip_ref = zipfile.ZipFile('github-issues.zip', 'r')
    zip_ref.extractall('.')
    zip_ref.close()

    # Read in a sample of the data (2M rows by default) to keep the tutorial fast.
    traindf, testdf = train_test_split(
        pd.read_csv('github_issues.csv').sample(n=args.sample_size),
        test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and pad / truncate so that each document has length 70.
    # Also, retain only the top 8,000 words in the vocabulary and map the
    # remaining words to 1, which becomes the shared index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    if args.output_model_gcs_bucket != "":
        bucket = storage.Bucket(storage.Client(), args.output_model_gcs_bucket)
        storage.Blob(args.output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
Example #11
  def build_model(self, learning_rate):
    """Build a keras model."""
    logging.info("starting")

    if self.job_name and self.job_name.lower() in ["ps"]:
      logging.info("ps doesn't build model")
      return

    self.encoder_input_data, doc_length = load_encoder_inputs(
      self.preprocessed_bodies)
    self.decoder_input_data, self.decoder_target_data = load_decoder_inputs(
      self.preprocessed_titles)

    num_encoder_tokens, self.body_pp = load_text_processor(
      self.body_pp_file)
    num_decoder_tokens, self.title_pp = load_text_processor(
      self.title_pp_file)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ##### Define Model Architecture ######

    ########################
    #### Encoder Model ####
    encoder_inputs = keras.layers.Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = keras.layers.Embedding(
      num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
    x = keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    #  encode without decoding if we want to.

    encoder_model = keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ########################
    #### Decoder Model ####
    decoder_inputs = keras.layers.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = keras.layers.Embedding(
      num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding',
      mask_zero=False)(decoder_inputs)
    dec_bn = keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # TODO(https://github.com/kubeflow/examples/issues/196):
    # With TF.Estimator we hit https://github.com/keras-team/keras/issues/9761
    # and the model won't train.
    decoder_gru = keras.layers.GRU(
      latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')

    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=[seq2seq_encoder_out])
    x = keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = keras.layers.Dense(
      num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model ####

    self.seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    self.seq2seq_Model.compile(
      optimizer=keras.optimizers.Nadam(lr=learning_rate),
      loss='sparse_categorical_crossentropy',)
      #  TODO(jlewi): Computing accuracy causes a dimension mismatch.
      # tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [869] vs. [79,11] # pylint: disable=line-too-long
      # [[{{node metrics/acc/Equal}} = Equal[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](metrics/acc/Reshape, metrics/acc/Cast)]]  # pylint: disable=line-too-long
      # metrics=['accuracy'])

    self.seq2seq_Model.summary()
Example #12
# We want to vectorize all of the code without docstrings so we can test the efficacy of the search on the code that was never seen by the model.

# In[9]:

from keras.models import load_model
from pathlib import Path
import numpy as np
from seq2seq_utils import load_text_processor
code2emb_path = Path('./data/code2emb/')
seq2seq_path = Path('./data/seq2seq/')
data_path = Path('./data/processed_data/')

# In[10]:

code2emb_model = load_model(code2emb_path / 'code2emb_model.hdf5')
num_encoder_tokens, enc_pp = load_text_processor(seq2seq_path /
                                                 'py_code_proc_v2.dpkl')

with open(data_path / 'without_docstrings.function', 'r') as f:
    no_docstring_funcs = f.readlines()

# ### Pre-process code without docstrings for input into `code2emb` model
#
# We use the same transformer we used to train the original model.

# In[13]:

# tokenized functions that did not contain docstrings
no_docstring_funcs[:5]

# In[11]:
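
# A sketch of the vectorization step that follows, mirroring the create_vector
# method shown earlier in this listing (the batch size is illustrative):

# Vectorize the docstring-free functions with the training-time preprocessor.
encinp = enc_pp.transform(no_docstring_funcs)

# Map each tokenized function to an embedding with the code2emb model.
nodoc_vecs = code2emb_model.predict(encinp, batch_size=200)

# Sanity check: one output row per input function, then persist for the search index.
assert nodoc_vecs.shape[0] == encinp.shape[0]
np.save(code2emb_path / 'nodoc_vecs.npy', nodoc_vecs)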
Example #13
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data",
                        type=str,
                        default="",
                        help="The input location, a local file path.")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model, a local file path.")

    #####################################################
    #  Optional section, based on what your model needs
    #####################################################

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")

    ########################################################
    #  End of optional args section
    #
    #  Be sure to add your args at the appropriate sections
    #  of the training code
    ########################################################

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    ##################################################
    #  Reading input file(s)
    #  Make changes as needed
    ##################################################

    # Reading input data file
    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        zip_ref = zipfile.ZipFile(args.input_data, 'r')
        zip_ref.extractall('.')
        zip_ref.close()
        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data


    ###################################################
    #  Fill in your model training code starting here
    ###################################################

    # Read in a sample of the data (2M rows by default) to keep the tutorial fast.
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and pad / truncate so that each document has length 70.
    # Also, retain only the top 8,000 words in the vocabulary and map the
    # remaining words to 1, which becomes the shared index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    ########################################################
    #  End of your training code
    #
    #  * Be sure to save your model to args.output_model
    #     such as Model.save(args.output_model)
    ########################################################

    # Save model.
    seq2seq_Model.save(args.output_model)
Example #14
    np.save(data_dir + 'train_title_vecs.npy', train_title_vecs)
    np.save(data_dir + 'train_body_vecs.npy', train_body_vecs)
else:
    time.sleep(120)

while True:
    if os.path.isfile(data_dir + 'train_body_vecs.npy'):
        break
    print("Waiting for dataset")
    time.sleep(2)
encoder_input_data, doc_length = load_encoder_inputs(data_dir +
                                                     'train_body_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(
    data_dir + 'train_title_vecs.npy')

num_encoder_tokens, body_pp = load_text_processor(data_dir + 'body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor(data_dir + 'title_pp.dpkl')

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = tf.keras.layers.Input(shape=(doc_length, ),
                                       name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = tf.keras.layers.Embedding(num_encoder_tokens,
                              latent_dim,
Example #15
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument(
        "--input_data",
        type=str,
        default="",
        help="The input location. Can be a GCS or local file path.")

    # TODO(jlewi): The following arguments are deprecated; just
    # use input_data. We should remove them as soon as all call sites
    # are updated.
    parser.add_argument("--input_data_gcs_bucket",
                        type=str,
                        default="kubeflow-examples")
    parser.add_argument(
        "--input_data_gcs_path",
        type=str,
        default="github-issue-summarization-data/github-issues.zip")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model GCS or local file path.")

    # TODO(jlewi): We should get rid of the following arguments and just use
    # --output_model_h5. If the output is a gs:// location we should use
    # a local file and then upload it to GCS.
    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument(
        "--output_model_gcs_path",
        type=str,
        default="github-issue-summarization-data/output_model.h5")

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5",
                        type=str,
                        default="output_model.h5")

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    # For backwards compatibility.
    input_data_gcs_bucket = None
    input_data_gcs_path = None

    if not args.input_data:
        # Since input_data isn't set, fall back on the old arguments.
        input_data_gcs_bucket = args.input_data_gcs_bucket
        input_data_gcs_path = args.input_data_gcs_path
    else:
        if args.input_data.startswith('gs://'):
            input_data_gcs_bucket, input_data_gcs_path = split_gcs_uri(
                args.input_data)

    if input_data_gcs_bucket:
        logging.info("Download bucket %s object %s.", input_data_gcs_bucket,
                     input_data_gcs_path)
        bucket = storage.Bucket(storage.Client(), input_data_gcs_bucket)
        args.input_data = 'github-issues.zip'
        storage.Blob(input_data_gcs_path,
                     bucket).download_to_filename(args.input_data)

    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        zip_ref = zipfile.ZipFile(args.input_data, 'r')
        zip_ref.extractall('.')
        zip_ref.close()
        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data

    # Read in a sample of the data (2M rows by default) to keep the tutorial fast.
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and pad / truncate so that each document has length 70.
    # Also, retain only the top 8,000 words in the vocabulary and map the
    # remaining words to 1, which becomes the shared index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    # For backwards compatibility.
    output_model_gcs_bucket = None
    output_model_gcs_path = None

    if not args.output_model:
        # Since output_model isn't set, fall back on the old arguments.
        output_model_gcs_bucket = args.output_model_gcs_bucket
        output_model_gcs_path = args.output_model_gcs_path
    else:
        if args.output_model.startswith('gs://'):
            output_model_gcs_bucket, output_model_gcs_path = split_gcs_uri(
                args.output_model)

    if output_model_gcs_bucket:
        logging.info("Uploading model to bucket %s path %s.",
                     output_model_gcs_bucket, output_model_gcs_path)
        bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
        storage.Blob(output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
Example #16
                    type=int,
                    default=get_value_as_int('BATCH_SIZE', 1200))
parser.add_argument("--validation_split",
                    type=float,
                    default=get_value_as_float('BATCH_SIZE', 0.12))
args = parser.parse_args()
print(args)

learning_rate = float(args.learning_rate)

encoder_input_data, doc_length = load_encoder_inputs(
    args.input_train_body_vecs_npy)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    args.input_train_title_vecs_npy)

num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

###############
# Encoder Model.
###############
encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens,
              latent_dim,
              name='Body-Word-Embedding',
Example #17
    np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code)
    np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment)


# Arrange data for modeling

# In[5]:



from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'py_comment_proc_v2.dpkl')


# If you don't have the above files on disk because you set `use_cache = True`, you can download them from the URLs below (a download sketch follows the list):
# 
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_code_vecs_v2.npy
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_comment_vecs_v2.npy
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl
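
# A sketch of fetching those artifacts with Keras' get_file; it caches files under
# ~/.keras/datasets and returns the local path, so point OUTPUT_PATH there or copy
# the files into place afterwards.
from keras.utils import get_file

BASE_URL = 'https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/'
for fname in ['py_t_code_vecs_v2.npy', 'py_t_comment_vecs_v2.npy',
              'py_code_proc_v2.dpkl', 'py_comment_proc_v2.dpkl']:
    # get_file skips the download when the file is already in the cache.
    get_file(fname=fname, origin=BASE_URL + fname)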

# # Build Seq2Seq Model For Summarizing Code
# 
# We will build a model to predict the docstring given a function or a method.  While this is a very cool task in itself, this is not the end goal of this exercise.  The motivation for training this model is to learn a general purpose feature extractor for code that we can use for the task of code search.

# In[6]:
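
# A sketch of the model definition, following the same single-encoder/decoder
# pattern used throughout these examples; the latent dimension and learning rate
# are illustrative values, not necessarily the notebook's.
from keras.models import Model
from keras.layers import Input, GRU, Dense, Embedding, BatchNormalization
from keras import optimizers

latent_dim = 300

# Encoder: embed the code tokens and keep only the final GRU hidden state.
encoder_inputs = Input(shape=(encoder_seq_len,), name='Encoder-Input')
enc = Embedding(num_encoder_tokens, latent_dim,
                name='Code-Word-Embedding', mask_zero=False)(encoder_inputs)
enc = BatchNormalization(name='Encoder-Batchnorm-1')(enc)
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(enc)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

# Decoder: teacher-forced GRU initialized with the encoder's final state.
decoder_inputs = Input(shape=(None,), name='Decoder-Input')
dec = Embedding(num_decoder_tokens, latent_dim,
                name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec = BatchNormalization(name='Decoder-Batchnorm-1')(dec)
dec, _ = GRU(latent_dim, return_state=True, return_sequences=True,
             name='Decoder-GRU')(dec, initial_state=seq2seq_encoder_out)
dec = BatchNormalization(name='Decoder-Batchnorm-2')(dec)
decoder_outputs = Dense(num_decoder_tokens, activation='softmax',
                        name='Final-Output-Dense')(dec)

seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.00005),
                      loss='sparse_categorical_crossentropy')
seq2seq_Model.summary()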
Example #18
# Save the preprocessor
with open(body_pkl_file, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(title_pkl_file, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data
np.save(train_title_vecs_file, train_title_vecs)
np.save(train_body_vecs_file, train_body_vecs)

encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs_file)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    train_title_vecs_file)

num_encoder_tokens, body_pp = load_text_processor(body_pkl_file)
num_decoder_tokens, title_pp = load_text_processor(title_pkl_file)

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens,
              latent_dim,
              name='Body-Word-Embedding',
Example #19
else:
    filename = 'data/seq2seq/code_summary_seq2seq_model.h5'



seq2seq_Model = load_model(filename)


loc = ""
# Load encoder (code) pre-processor from url
if(args['download']):
    loc = get_file(fname='py_code_proc_v2.dpkl',
               origin='https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl')
else:
    loc="data/seq2seq/py_code_proc_v2.dpkl"
num_encoder_tokens, enc_pp = load_text_processor(loc)

loc = ""
# Load encoder (code) pre-processor from url
if(args['download']):
# Load decoder (docstrings/comments) pre-processor from url
    loc = get_file(fname='py_comment_proc_v2.dpkl',
               origin='https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl')

else:
    loc="data/seq2seq/py_comment_proc_v2.dpkl"
num_decoder_tokens, dec_pp = load_text_processor(loc)

from seq2seq_utils import Seq2Seq_Inference
import pandas as pd
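
# A sketch of the typical next step, mirroring the earlier examples in this listing:
# wire the loaded model and text processors into the inference wrapper. Predictions
# can then be generated with demo_model_predictions on a DataFrame that has
# 'code', 'comment', and 'ref' columns, as shown above.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                decoder_preprocessor=dec_pp,
                                seq2seq_model=seq2seq_Model)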