def predict():
    """Score the lines of ``generatetag.txt`` against the holdout comments.

    Reads the training/holdout split, loads the saved seq2seq model and the
    code/comment text processors, wraps them in a ``Seq2Seq_Inference``
    object and passes the lines of ``generatetag.txt`` together with the
    holdout comments to ``evaluate_model``. The returned score is printed.

    Returns:
        None. The score is only printed.
    """
    # The training split is read but not used by this function.
    _, holdout_code, _, holdout_comment = read_training_files(
        '../../data/processed_data/')

    # NOTE(review): absolute, machine-specific path -- consider making it
    # configurable.
    model_path = "/home/bohong/文档/mygit/cdpensearch/cdpensearch/oneEncoder/seqmodel.hdf5"
    seq2seq_Model = load_model(model_path)

    # Only the processors are needed; the token counts are discarded.
    _, enc_pp = load_text_processor(OUTPUT_PATH / 'py_code_proc_v2.dpkl')
    _, dec_pp = load_text_processor(OUTPUT_PATH / 'py_comment_proc_v2.dpkl')

    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                    decoder_preprocessor=dec_pp,
                                    seq2seq_model=seq2seq_Model)

    # Input frame for the prediction step below, which is currently disabled.
    demo_testdf = pd.DataFrame({
        'code': holdout_code,
        'comment': holdout_comment,
        'ref': ''
    })
    # seq2seq_inf.predications(df=demo_testdf)

    # Read the lines inside a context manager so the handle is released even
    # if the evaluation below raises.
    with open("generatetag.txt") as tag_file:
        generated_lines = tag_file.readlines()

    score = seq2seq_inf.evaluate_model(generated_lines,
                                       holdout_comment,
                                       max_len=None)
    print(score)
def __init__(self):
    """Build ``self.model`` from the pickled preprocessors and saved model.

    Loads ``body_pp.dpkl`` and ``title_pp.dpkl`` with ``dpickle`` and the
    model from ``seq2seq_model_tutorial.h5``, all relative to the current
    working directory.
    """
    with open('body_pp.dpkl', 'rb') as body_file:
        encoder_pp = dpickle.load(body_file)

    with open('title_pp.dpkl', 'rb') as title_file:
        decoder_pp = dpickle.load(title_file)

    trained_model = load_model('seq2seq_model_tutorial.h5')

    self.model = Seq2Seq_Inference(encoder_preprocessor=encoder_pp,
                                   decoder_preprocessor=decoder_pp,
                                   seq2seq_model=trained_model)
def evaluate_keras(self):
    """Predict on the holdout set and return the resulting BLEU score.

    Wraps ``self.body_pp``, ``self.title_pp`` and ``self.seq2seq_Model`` in
    a ``Seq2Seq_Inference`` object and evaluates it on the ``body`` and
    ``issue_title`` columns of ``self.test_df``. The score is logged at
    INFO level and returned.
    """
    inference = Seq2Seq_Inference(encoder_preprocessor=self.body_pp,
                                  decoder_preprocessor=self.title_pp,
                                  seq2seq_model=self.seq2seq_Model)

    bodies = self.test_df.body.tolist()
    titles = self.test_df.issue_title.tolist()

    score = inference.evaluate_model(holdout_bodies=bodies,
                                     holdout_titles=titles,
                                     max_len_title=12)
    logging.info("Bleu score: %s", score)
    return score
def load_seq2seq_model(self):
    """Load the code-summary model and store it in ``self.seq2seq_inf``.

    Clears the Keras session, then reads the model and both text processors
    from ``self.seq2seq_path`` and wraps them in a ``Seq2Seq_Inference``
    object.
    """
    K.clear_session()

    model_file = self.seq2seq_path / 'code_summary_seq2seq_model.h5'
    trained_model = load_model(str(model_file))

    # The token counts returned with each processor are not needed here.
    _, code_pp = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')
    _, comment_pp = load_text_processor(
        self.seq2seq_path / 'py_comment_proc_v2.dpkl')

    self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                         decoder_preprocessor=comment_pp,
                                         seq2seq_model=trained_model)
def __init__(self):
    """Build ``self.model`` from files named by environment variables.

    ``BODY_PP_FILE``, ``TITLE_PP_FILE`` and ``MODEL_FILE`` select the body
    preprocessor, title preprocessor and model file; each falls back to a
    default name in the working directory. Every chosen path is printed
    before it is loaded.
    """
    body_path = os.getenv('BODY_PP_FILE', 'body_pp.dpkl')
    print('body_pp file {0}'.format(body_path))
    with open(body_path, 'rb') as handle:
        encoder_pp = dpickle.load(handle)

    title_path = os.getenv('TITLE_PP_FILE', 'title_pp.dpkl')
    print('title_pp file {0}'.format(title_path))
    with open(title_path, 'rb') as handle:
        decoder_pp = dpickle.load(handle)

    model_path = os.getenv('MODEL_FILE', 'seq2seq_model_tutorial.h5')
    print('model file {0}'.format(model_path))
    trained_model = load_model(model_path)

    self.model = Seq2Seq_Inference(encoder_preprocessor=encoder_pp,
                                   decoder_preprocessor=decoder_pp,
                                   seq2seq_model=trained_model)
def detect(inputs, input_model_h5, input_title_preprocessor_dpkl,
           input_body_preprocessor_dpkl):
    """Generate an issue title for the first element of ``inputs``.

    Args:
        inputs: Indexable collection; only ``inputs[0]`` is used.
        input_model_h5: Path of the saved Keras model.
        input_title_preprocessor_dpkl: Path of the title (decoder)
            preprocessor.
        input_body_preprocessor_dpkl: Path of the body (encoder)
            preprocessor.

    Returns:
        Whatever ``Seq2Seq_Inference.generate_issue_title`` returns for
        ``inputs[0]``.
    """
    # Load model, preprocessors. The token counts are not needed.
    seq2seq_Model = keras.models.load_model(input_model_h5)
    _, body_pp = load_text_processor(input_body_preprocessor_dpkl)
    _, title_pp = load_text_processor(input_title_preprocessor_dpkl)

    # Prepare inference.
    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)

    # Index the ``inputs`` parameter. The previous code indexed the builtin
    # ``input`` function, which raised TypeError on every call.
    return seq2seq_inf.generate_issue_title(inputs[0])
def load_summarizer(seq2seq_model_path, text_processor_path):
    """
    Loads the code summarizer model and returns the inference object
    to be used for predicting docstrings.

    Input:
    -----
    seq2seq_model_path: directory containing the pre-trained ``.hdf5`` model
    text_processor_path: directory containing both ``.dpkl`` text processors

    Returns:
    (Seq2Seq_Inference object, default TensorFlow graph)

    Author: Tyler Medlin
    """
    # The upstream code uses many soon-to-be-deprecated functions;
    # suppress the resulting warnings.
    tf.logging.set_verbosity('ERROR')
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    warnings.filterwarnings("ignore")

    # Load model
    logging.warning('Loading pre-trained model...')
    model_file = seq2seq_model_path + '/py_func_sum_v9_.epoch16-val2.55276.hdf5'
    trained_model = load_model(model_file)

    # Load encoder (code) pre-processor
    logging.warning('Loading text processor (encoder)...')
    encoder_file = text_processor_path + '/py_code_proc_v2.dpkl'
    _, code_pp = load_text_processor(encoder_file)

    # Load decoder (docstrings/comments) pre-processor
    logging.warning('Loading text processor (decoder)...')
    decoder_file = text_processor_path + '/py_comment_proc_v2.dpkl'
    _, comment_pp = load_text_processor(decoder_file)

    default_graph = tf.get_default_graph()
    summarizer = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                   decoder_preprocessor=comment_pp,
                                   seq2seq_model=trained_model)
    return summarizer, default_graph
def create_autotag(self, postgres, file_id):
    """Predict a tag for each paragraph of a file and store the results.

    Reloads the code-summary model into ``self.seq2seq_inf``, fetches the
    paragraphs of ``file_id`` through ``postgres``, runs
    ``demo_model_predictions`` on them, writes each predicted tag to
    ``without_docstrings.autotag`` under ``self.data_path`` and passes it to
    ``postgres.update_autotag`` with the paragraph id at the same position.
    The Keras session is cleared before loading and again at the end.
    """
    K.clear_session()

    model_file = self.seq2seq_path / 'code_summary_seq2seq_model.h5'
    trained_model = load_model(str(model_file))
    _, code_pp = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')
    _, comment_pp = load_text_processor(
        self.seq2seq_path / 'py_comment_proc_v2.dpkl')
    self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                         decoder_preprocessor=comment_pp,
                                         seq2seq_model=trained_model)

    # The stored auto/manual tags are fetched but not used here.
    paras, paraids, _autotags, _manualtags = postgres.get_paragraphs_fileid(
        file_id)
    no_docstring_funcs = list(map(str, paras))
    no_docstring_paraids = paraids
    print("no_docstring_paraids = ", no_docstring_paraids)
    print("size of paragraphs = ", len(no_docstring_funcs))
    print("size of paraids = ", len(no_docstring_paraids))

    frame = pd.DataFrame({
        'code': no_docstring_funcs,
        'comment': '',
        'ref': ''
    })
    auto_tag = self.seq2seq_inf.demo_model_predictions(n=15, df=frame)
    print("size of auto_tag = ", len(auto_tag))

    out_path = self.data_path / 'without_docstrings.autotag'
    with open(out_path, 'w', encoding='utf-8') as tag_file:
        # Tags are matched to paragraph ids by position.
        for position, tag in enumerate(auto_tag):
            tag_file.write("%s\n" % tag)
            postgres.update_autotag(no_docstring_paraids[position], tag)

    K.clear_session()
def load_models(self):
    """Load the summary model, the embedding model and the text processors.

    Clears the Keras session, then fills ``self.seq2seq_inf``,
    ``self.code2emb_model``, ``self.num_encoder_tokens_vector`` and
    ``self.enc_pp_vector``. A progress message is printed before each load.
    """
    K.clear_session()

    print("Going to load 'code_summary_seq2seq_model.h5'")
    summary_model = load_model(
        str(self.seq2seq_path / 'code_summary_seq2seq_model.h5'))

    print("Going to load 'py_code_proc_v2.dpkl'")
    _, code_pp = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')

    print("Going to load 'py_comment_proc_v2.dpkl'")
    _, comment_pp = load_text_processor(
        self.seq2seq_path / 'py_comment_proc_v2.dpkl')

    print("Going to load 'Seq2Seq_Inference'")
    self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                         decoder_preprocessor=comment_pp,
                                         seq2seq_model=summary_model)

    print("Going to load 'code2emb_model.hdf5'")
    embedding_file = str(self.code2emb_path / 'code2emb_model.hdf5')
    self.code2emb_model = load_model(embedding_file,
                                     custom_objects=None,
                                     compile=False)

    # The code processor is loaded a second time, from seq2seq_path, and this
    # time both the token count and the processor are kept.
    print("Going to load 'py_code_proc_v2.dpkl'")
    vector_processor = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')
    self.num_encoder_tokens_vector, self.enc_pp_vector = vector_processor
import os
import cPickle as pickle

from keras.models import load_model
import pandas as pd

# Pickled bundle: encoder/decoder data, decoder targets and both vocabulary maps.
with open('github_issues-bundle.pkl', 'rb') as bundle_file:
    X, Y, decoder_target_data, idx2word, word2idx = pickle.load(bundle_file)

# Pickled train/test split.
with open('github_issues.pkl', 'rb') as split_file:
    train_dataset, test_dataset = pickle.load(split_file)

print('Train', train_dataset.shape)
print('Test', test_dataset.shape)

seq2seq_Model = load_model('seq2seq_model_keras.h5')

from seq2seq_utils import Seq2Seq_Inference

seq2seq_inf = Seq2Seq_Inference(
    encoder_preprocessor=X,
    decoder_preprocessor=Y,
    seq2seq_model=seq2seq_Model,
    idx2word=idx2word,
    word2idx=word2idx,
)

# this method displays the predictions on random rows of the holdout set
seq2seq_inf.demo_model_predictions(n=1, issue_df=X, threshold=1)

print('EOP')
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags: four plain string flags, then the integer prediction count.
parser = argparse.ArgumentParser()
for path_flag in (
        "--input_model_h5",
        "--input_body_preprocessor_dpkl",
        "--input_title_preprocessor_dpkl",
        "--input_testdf_csv",
):
    parser.add_argument(path_flag)
parser.add_argument("--input_prediction_count", type=int, default=50)
args = parser.parse_args()
print(args)

# Read data.
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
seq2seq_Model = keras.models.load_model(args.input_model_h5)
num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)

# Prepare inference.
seq2seq_inf = Seq2Seq_Inference(
    encoder_preprocessor=body_pp,
    decoder_preprocessor=title_pp,
    seq2seq_model=seq2seq_Model,
)

# Output predictions for n random rows in the test set.
seq2seq_inf.demo_model_predictions(
    n=args.input_prediction_count,
    issue_df=testdf,
)
#
# To evaluate this model we are going to do two things:
#
# 1. Manually inspect the results of predicted docstrings for code snippets, to make sure they look sensible.
# 2. Calculate the [BLEU Score](https://en.wikipedia.org/wiki/BLEU) so that we can quantitatively benchmark different iterations of this algorithm and to guide hyper-parameter tuning.

# ### Manually Inspect Results (on holdout set)

# In[15]:


from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

# Wrap the trained model and both text processors for inference.
seq2seq_inf = Seq2Seq_Inference(
    encoder_preprocessor=enc_pp,
    decoder_preprocessor=dec_pp,
    seq2seq_model=seq2seq_Model,
)

# Holdout code/comment pairs; the 'ref' column is left empty.
demo_testdf = pd.DataFrame({
    'code': holdout_code,
    'comment': holdout_comment,
    'ref': '',
})
seq2seq_inf.demo_model_predictions(n=15, df=demo_testdf)


# ### Comment on manual inspection of results:
#
# The predicted code summaries are not perfect, but we can see that the model has learned to extract some semantic meaning from the code. That's all we need to get reasonable results in this case.

# ### Calculate BLEU Score (on holdout set)
#
# BLEU Score is described [in this wikipedia article](https://en.wikipedia.org/wiki/BLEU), and is a way to measure the efficacy of summarization/translation such as the one we conducted here. This metric is useful if you wish to conduct extensive hyper-parameter tuning and try to improve the seq2seq model.

# In[24]: