def process_data_for_language_model(save_path, val_sample=5000, tokenizer=None):
    """
    Read the IMDB folder files and create language model training and validation files.

    :params:
        - save_path: Path where to save lm_train.txt and lm_valid.txt
        - val_sample: Unused in this variant; the split is a fixed 90/10 train/test split
        - tokenizer: Tokenizer name/function passed to get_tokenizer
    """
    all_files = []
    for root, dirnames, filenames in os.walk(DATA_DIR):
        all_files.extend(
            [os.path.join(root, filename) for filename in filenames])
    train_files, test_files = train_test_split(all_files, test_size=0.1)
    test_text = [
        open(i, encoding='utf-8', errors='ignore').read() for i in test_files
    ]
    train_text = [
        open(i, encoding='utf-8', errors='ignore').read() for i in train_files
    ]
    tokenizer = get_tokenizer(tokenizer)
    process = lambda x: [' '.join(tokenizer(i)) for i in tqdm(x)]
    test = process(test_text)
    train = process(train_text)
    open(os.path.join(save_path, LM_TRAIN_FILE), "w").write("\n".join(train))
    open(os.path.join(save_path, LM_VAL_FILE), "w").write("\n".join(test))
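# Illustrative usage (a sketch, not part of the original source). This variant
# walks the module-level DATA_DIR rather than taking an imdb_path argument;
# the save path and tokenizer name below are assumptions.
if __name__ == "__main__":
    process_data_for_language_model(save_path="data/lm", tokenizer="nltk")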
def process_data_for_language_model(save_path, imdb_path, val_sample=5000, tokenizer=None):
    """
    Read the IMDB folder files and create language model training and validation files.

    :params:
        - save_path: Path where to save lm_train.txt and lm_valid.txt
        - imdb_path: Root directory of imdb dataset
        - val_sample: Number of files to select as validation set
        - tokenizer: Tokenizer name/function passed to get_tokenizer
    """
    tokenizer = get_tokenizer(tokenizer)
    pos = [open(os.path.join(imdb_path, TRAIN_POS_DIR, i)).read() for i in
           tqdm(os.listdir(os.path.join(imdb_path, TRAIN_POS_DIR)))]
    neg = [open(os.path.join(imdb_path, TRAIN_NEG_DIR, i)).read() for i in
           tqdm(os.listdir(os.path.join(imdb_path, TRAIN_NEG_DIR)))]
    test_pos = [open(os.path.join(imdb_path, TEST_POS_DIR, i)).read() for i in
                tqdm(os.listdir(os.path.join(imdb_path, TEST_POS_DIR)))]
    test_neg = [open(os.path.join(imdb_path, TEST_NEG_DIR, i)).read() for i in
                tqdm(os.listdir(os.path.join(imdb_path, TEST_NEG_DIR)))]
    unsup = [open(os.path.join(imdb_path, TRAIN_UNSUP_DIR, i)).read() for i in
             tqdm(os.listdir(os.path.join(imdb_path, TRAIN_UNSUP_DIR)))]
    process = lambda x: [' '.join(tokenizer(i)) for i in tqdm(x)]
    pos = process(pos)
    neg = process(neg)
    test_pos = process(test_pos)
    test_neg = process(test_neg)
    unsup = process(unsup)
    text = pos + neg + unsup + test_neg + test_pos
    print('Validation data creation..')
    # Use a set for O(1) membership checks when splitting train/validation
    val_index = set(random.sample(range(len(text)), val_sample))
    train = [i for idx, i in enumerate(text) if idx not in val_index]
    val = [i for idx, i in enumerate(text) if idx in val_index]
    open(os.path.join(save_path, LM_TRAIN_FILE), "w").write("\n".join(train))
    open(os.path.join(save_path, LM_VAL_FILE), "w").write("\n".join(val))
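# Illustrative usage (a sketch, not part of the original source). Assumes the
# standard aclImdb directory layout and that the TRAIN_*/TEST_* directory
# constants and LM_TRAIN_FILE / LM_VAL_FILE are defined at module level; the
# paths below are assumptions.
if __name__ == "__main__":
    process_data_for_language_model(save_path="data/lm",
                                    imdb_path="data/aclImdb",
                                    val_sample=5000,
                                    tokenizer="nltk")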
params = default_params()
ntokens = sum(1 for _ in open(os.path.join(args.data_dir, 'vocab.txt'))) + 1
if args.model == 'DAN':
    model = Model(emb_dim=64, ntokens=ntokens, hidden_dim=32,
                  output_dim=16).to(device)
    vocab = load_vocab(os.path.join(args.data_dir, 'vocab.txt'))
    tokenizer = None
elif args.model in ['GPT', 'GPT-2']:
    model = PretrainedModel(model_name=args.model).to(device)
    vocab = None
    tokenizer = get_tokenizer(args.model)
else:
    raise NotImplementedError(f'{args.model} --- no such model')
model = load_checkpoint(
    model, os.path.join(args.model_dir, f'checkpoint_{args.epoch}'))
pred = Predictor(model, tokenizer)
dataset = CsvDataset(csv_path=os.path.join(args.data_dir, 'data.csv'),
                     vocab=vocab,
                     max_len=50,
                     tokenizer=tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
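# Illustrative continuation (a sketch, not part of the original source): loop
# over the dataloader and hand each example to the predictor. The method name
# `pred.predict` below is hypothetical; use whatever interface Predictor exposes.
# for batch in dataloader:
#     print(pred.predict(batch))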
def pretrain_encoder(train_file, valid_file,
                     save_folder='saved_model/base', tokenizer=None,
                     restore=False, **kwargs):
    """
    Module for running the training and validation subroutines.

    :params:
        - train_file: Training file, with sentences separated by newlines
        - valid_file: Validation file, same format as above
        - save_folder: Folder to save output files and models
        - restore: Whether to restore from the save_folder (can be used to
          finetune on a smaller dataset)
        - tokenizer: Tokenizer to use for tokenizing sentences into tokens
        - **kwargs: other params:
            * batch_size
            * hidden_size
            * num_layers
            * epochs
            * seq_length
    :outputs: None
    """
    config = FW_CONFIG
    tokenizer = get_tokenizer(tokenizer) if tokenizer else None
    batch_size = kwargs.get("batch_size") or FW_CONFIG["batch_size"]
    hidden_size = kwargs.get("hidden_size") or FW_CONFIG["hidden_size"]
    num_layers = kwargs.get("num_layers") or FW_CONFIG["num_layers"]
    epochs = kwargs.get("epochs") or FW_CONFIG["epochs"]
    # "epochs" is not a graph parameter, so drop it before FW_CONFIG is passed on
    FW_CONFIG.pop("epochs", None)
    FW_CONFIG["num_candidate_samples"] = (kwargs.get("num_candidate_samples")
                                          or FW_CONFIG["num_candidate_samples"])
    seq_length = FW_CONFIG.pop("seq_length")
    learning_rate = kwargs.get("learning_rate", 0.001)
    optimizer = kwargs.get("optimizer", "adam")
    print_progress = kwargs.get("print_progress", False)
    type_ = kwargs.get("type", "rnn")
    learning_rate_decay = 0.1
    lr_cosine_decay_params = {
        "learning_rate": learning_rate,
        "first_decay_steps": 2000,
        "t_mul": 2.0,
        "alpha": 0.01
    }
    tokenizer_json_file = os.path.join(save_folder, "tokenizer.json")

    # Load data and batchify
    all_data = load_and_process_data(
        train_file, valid_file,
        max_vocab_size=config["max_vocab_size"],
        custom_tokenizer_function=tokenizer,
        tokenizer_json_file=tokenizer_json_file,
        restore_from=tokenizer_json_file if restore else None)
    word_freq, word_index, train_data, valid_data = all_data
    X_train, y_train = batchify(train_data, batch_size)
    X_valid, y_valid = batchify(valid_data, batch_size)

    # Save the vocab and frequency files
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Saving config and word_index file
    json.dump(word_index, open(os.path.join(save_folder, "word_index.json"), "w"))
    json.dump(word_freq, open(os.path.join(save_folder, "word_freq.json"), "w"))
    json.dump(FW_CONFIG, open(os.path.join(save_folder, "config.json"), "w"))

    # Arranging tokens in a list; this will go in the vocab file
    vocab = [" "] + [i[0] for i in sorted(word_index.items(),
                                          key=lambda x: x[1])][:FW_CONFIG["max_vocab_size"] + 1]
    open(os.path.join(save_folder, "vocab.txt"), "w").write("\n".join(vocab))
    # Note: this writes the keys of word_index, one token per line
    open(os.path.join(save_folder, "word_freqs.txt"), "w").write("\n".join(word_index))

    # Check max_vocab_size
    FW_CONFIG["max_vocab_size"] = min(len(word_index) + 1, FW_CONFIG["max_vocab_size"])
    print("Vocabulary Size: {}".format(FW_CONFIG["max_vocab_size"]))

    # Define placeholders and initial states
    inputs = tf.placeholder(dtype=tf.int32, shape=(batch_size, None), name='input')
    targets = tf.placeholder(dtype=tf.int64, shape=(batch_size, None), name='target')
    initial_state_c = tf.placeholder(dtype=tf.float32,
                                     shape=(num_layers, batch_size, hidden_size),
                                     name='input_state_c')
    initial_state_h = tf.placeholder(dtype=tf.float32,
                                     shape=(num_layers, batch_size, hidden_size),
                                     name='input_state_h')

    # Create the graph
    train_op, training_flag, sampled_loss, loss, rnn_states, weights, learning_rate_var = \
        language_model_graph(inputs, targets,
                             (initial_state_c, initial_state_h),
                             vocab_freqs=word_freq,
                             optimizer=optimizer,
                             type_=type_,
                             **config)
    final_state_c, final_state_h = rnn_states
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    print("total number of trainable params {}".format(get_trainbale_params()))

    # Define run epoch function params (passed as kwargs)
    run_epoch_params = {"session": sess,
                        "sampled_loss": sampled_loss,
                        "loss": loss,
                        "num_layers": num_layers,
                        "input_placeholder": inputs,
                        "target_placeholder": targets,
                        "initial_state_c": initial_state_c,
                        "initial_state_h": initial_state_h,
                        "learning_rate_var": learning_rate_var,
                        "learning_rate": learning_rate,
                        "train_op": train_op,
                        "final_state_c": final_state_c,
                        "final_state_h": final_state_h,
                        "seq_length": seq_length,
                        "batch_size": batch_size,
                        "hidden_size": hidden_size,
                        "training_flag": training_flag,
                        "lr_cosine_decay_params": lr_cosine_decay_params}

    valid_losses = [1000]
    train_vars = [i for i in tf.trainable_variables() if 'optimizer' not in i.name]
    saver = tf.train.Saver(train_vars)
    if restore:
        saver.restore(sess, os.path.join(save_folder, "model.ckpt"))

    for epoch in range(epochs):
        decay = learning_rate_decay ** int(max(epoch - 5, 0) / 2)
        run_epoch_params['learning_rate'] = learning_rate * decay
        # Training epoch
        train_loss = _run_epoch(X_train, y_train, train=True, epoch=epoch,
                                print_progress=print_progress, **run_epoch_params)
        # Validation epoch
        valid_loss = _run_epoch(X_valid, y_valid, train=False, print_progress=False,
                                epoch=epoch, **run_epoch_params)
        format_values = [epoch, train_loss, np.exp(train_loss),
                         valid_loss, np.exp(valid_loss)]
        print("Epoch {0}, Train Loss {1:.2f}, Train Perplexity {2:.2f}, "
              "Val Loss {3:.2f}, Val Perplexity {4:.2f}".format(*format_values))
        if valid_loss < min(valid_losses):
            # Keep the best checkpoint and a pickled copy of the raw weights
            saver.save(sess, os.path.join(save_folder, "model.ckpt"))
            numpy_weights = {layer: sess.run(weights[layer]) for layer in weights}
            pickle.dump(numpy_weights,
                        open(os.path.join(save_folder, "weights.pkl"), "wb"))
        valid_losses.append(valid_loss)
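# Illustrative usage (a sketch, not part of the original source). File names
# and hyperparameter values are assumptions; FW_CONFIG supplies any value not
# overridden through **kwargs.
if __name__ == "__main__":
    pretrain_encoder("lm_train.txt", "lm_valid.txt",
                     save_folder="saved_model/base",
                     tokenizer="nltk",
                     batch_size=64, epochs=10, learning_rate=0.001)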
def create_tfrecords(params, write_remainder=True, write_every_n_files=1,
                     save_checkpoints=False, resume_from_checkpoint=False,
                     display_pbar=False):
    # Iterates through files in input_dir, splitting into <args.chunk_size>
    # chunks and saving a tfrecords file every <args.files_per> chunks.
    files, args, process_no = params
    enc = get_tokenizer()  # get tokenizer

    # init metadata
    discarded_files = 0
    files_processed = 0
    pbar = tqdm(
        desc=f"Writing TFRecord Files to {args.output_dir}. Parsed 0 input files. files_written ",
        disable=not display_pbar)
    checkpoint_path = f"{args.output_dir}/checkpoint.txt"
    resume_files_processed, tfrecord_count = read_checkpoint(
        checkpoint_path, resume_from_checkpoint)

    data_to_prepend = []
    tokenized_files_array = []

    for f in files:
        for tokenized_files in archive_to_tokens(f, enc, args):
            files_processed += 1
            if files_processed < resume_files_processed:
                continue  # resume from checkpoint

            # if the last chunk < chunk size, but > minimum_size, take it and
            # append it to the beginning of the next file
            n_tokens = len(tokenized_files[-1])
            if n_tokens < args.chunk_size:
                data = tokenized_files.pop(-1)
                if n_tokens >= args.minimum_size:
                    data_to_prepend.extend(data)
                else:
                    discarded_files += 1

            if len(data_to_prepend) >= args.chunk_size:
                # if length of data_to_prepend becomes greater than chunk size,
                # add concatted files to tokenized files
                tokenized_files_array.append(data_to_prepend[:args.chunk_size])
                data_to_prepend = data_to_prepend[args.chunk_size:]

            # add tokenized files > chunk size to main array
            tokenized_files_array.extend(tokenized_files)

            if len(tokenized_files_array) >= args.files_per * write_every_n_files:
                # write every n files
                _tfrecord_count, remainder = write_files(
                    tokenized_files_array,
                    files_per=args.files_per,
                    output_dir=args.output_dir,
                    out_name=args.name,
                    start_no=tfrecord_count,
                    process_no=process_no)
                pbar.update(_tfrecord_count - tfrecord_count)  # update progress bar
                pbar.set_description(
                    f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written "
                )
                tfrecord_count = _tfrecord_count
                # add remaining files to next chunk
                tokenized_files_array = remainder if remainder is not None else []
                with open(checkpoint_path, "w") as checkpoint_file:
                    checkpoint_file.write(f"{files_processed}, {tfrecord_count}")

    if len(tokenized_files_array) >= args.files_per:  # also write at end
        _tfrecord_count, remainder = write_files(tokenized_files_array,
                                                 files_per=args.files_per,
                                                 output_dir=args.output_dir,
                                                 out_name=args.name,
                                                 start_no=tfrecord_count,
                                                 process_no=process_no)
        pbar.update(_tfrecord_count - tfrecord_count)
        pbar.set_description(
            f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written "
        )
        tfrecord_count = _tfrecord_count
        with open(checkpoint_path, "w") as checkpoint_file:
            checkpoint_file.write(f"{files_processed}, {tfrecord_count}")
    else:
        remainder = tokenized_files_array  # add remaining to remainder

    if write_remainder:
        # write out the remaining files even if there's less than files_per
        write_files(remainder,
                    files_per=args.files_per,
                    output_dir=args.output_dir,
                    out_name=args.name,
                    start_no=tfrecord_count,
                    write_remainder=True)

    successful_files = files_processed - discarded_files
    return {
        "discarded": discarded_files,
        "processed": files_processed,
        "successful": successful_files
    }
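# Illustrative usage (a sketch, not part of the original source). The fields
# set on `args` below are only the ones this function reads directly;
# archive_to_tokens may expect more, and every value here is an assumption.
if __name__ == "__main__":
    from types import SimpleNamespace
    args = SimpleNamespace(output_dir="tfrecords", name="openwebtext",
                           chunk_size=2048, minimum_size=100, files_per=1000)
    files = ["data/archive_0.jsonl.zst"]  # example input archive path
    stats = create_tfrecords((files, args, 0), display_pbar=True)
    print(stats)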
def process_data_for_classification(imdb_path, val_sample=5000, tokenizer='nltk', maxlen=500):
    """
    Read the IMDB folder files and create classification train, validation and test sets.

    :params:
        - imdb_path: Root directory of imdb dataset
        - val_sample: Number of training files to hold out as validation set
        - tokenizer: Tokenizer to use ('nltk' by default); falls back to
          text_to_word_sequence when empty
        - maxlen: Maximum sequence length; longer reviews are truncated
    """
    file_name = 'imdb_processed_data_maxlen_{}_tokenizer_{}_val_{}.pkl'.format(
        maxlen, tokenizer, val_sample)
    if not os.path.exists(file_name):
        pos = [open(os.path.join(imdb_path, TRAIN_POS_DIR, i)).read() for i in
               tqdm(os.listdir(os.path.join(imdb_path, TRAIN_POS_DIR)))]
        neg = [open(os.path.join(imdb_path, TRAIN_NEG_DIR, i)).read() for i in
               tqdm(os.listdir(os.path.join(imdb_path, TRAIN_NEG_DIR)))]
        test_pos = [open(os.path.join(imdb_path, TEST_POS_DIR, i)).read() for i in
                    tqdm(os.listdir(os.path.join(imdb_path, TEST_POS_DIR)))]
        test_neg = [open(os.path.join(imdb_path, TEST_NEG_DIR, i)).read() for i in
                    tqdm(os.listdir(os.path.join(imdb_path, TEST_NEG_DIR)))]
        train = pos + neg
        test = test_pos + test_neg
        print('Validation data creation..')
        val_index = set(random.sample(range(len(train)), val_sample))
        x_train = [i for idx, i in enumerate(train) if idx not in val_index]
        # The first len(pos) entries of train are the positive reviews
        y_train = [
            1 if idx < len(pos) else 0 for idx, i in enumerate(train)
            if idx not in val_index
        ]
        x_val = [i for idx, i in enumerate(train) if idx in val_index]
        y_val = [
            1 if idx < len(pos) else 0 for idx, i in enumerate(train)
            if idx in val_index
        ]
        x_test = test
        y_test = [1] * len(test_pos) + [0] * len(test_neg)
        if not tokenizer:
            tokenizer = text_to_word_sequence
        else:
            tokenizer = get_tokenizer(tokenizer)
        x_train = list(map(tokenizer, x_train))
        x_val = list(map(tokenizer, x_val))
        x_test = list(map(tokenizer, x_test))
        custom_pad_sequences = lambda x: pad_sequences(
            x, maxlen=maxlen, dtype=object, padding='pre', value="-pad-")
        x_train = custom_pad_sequences(x_train)
        x_val = custom_pad_sequences(x_val)
        x_test = custom_pad_sequences(x_test)
        pickle.dump([x_train, y_train, x_val, y_val, x_test, y_test],
                    open(file_name, 'wb'))
        return x_train, y_train, x_val, y_val, x_test, y_test
    else:
        x_train, y_train, x_val, y_val, x_test, y_test = pickle.load(
            open(file_name, 'rb'))
        return x_train, y_train, x_val, y_val, x_test, y_test
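# Illustrative usage (a sketch, not part of the original source). The IMDB root
# path is an assumption; processed splits are cached to a pickle on first run.
if __name__ == "__main__":
    x_train, y_train, x_val, y_val, x_test, y_test = \
        process_data_for_classification("data/aclImdb", val_sample=5000,
                                        tokenizer="nltk", maxlen=500)
    print(len(x_train), len(x_val), len(x_test))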
def main_siamese_lstm(bug_contents_path, code_contents_path, file_oracle_path,
                      sequence_oracle_path, model_dir_path, prediction_dir_path,
                      evaluation_file_path, vocabulary_size, lstm_core_length,
                      word2vec_model_path=None, lstm_seq_length=200,
                      sample_num=50, split_ratio=0.8,
                      activation_function='tanh',
                      inner_activation_function='hard_sigmoid',
                      distance_function='cos', initializer='glorot_uniform',
                      inner_initializer='orthogonal', regularizer=None,
                      optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=1e-8, decay=0.0),
                      dropout=0.0, epoch_num=100, k_value=10, rel_threshold=0.5,
                      embedding_dimension=-1, word2vec=False):

    if not os.path.isdir(model_dir_path):
        os.mkdir(model_dir_path)

    # Loading the pretrained word2vec model
    word2vec_model = None
    if word2vec:
        print("loading word2vec model:")
        word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path,
                                                           binary=False)
        print("finished loading word2vec model.")

    # Loading the generated data from file
    print("loading data from file:")
    [bug_contents, code_contents, file_oracle, sequence_oracle] = load_data(
        bug_contents_path, code_contents_path, file_oracle_path,
        sequence_oracle_path, split_length=lstm_seq_length, encoding='utf-8')
    print("finished loading data from file.")

    # Initializing the tokenizer
    print("initializing tokenizer:")
    tokenizer = get_tokenizer(bug_contents, code_contents, vocabulary_size)
    print("finished initializing tokenizer.")

    # The earlier bugs are used for training; the remaining bugs for testing
    nb_train_bug = int(math.floor(len(bug_contents) * split_ratio))

    # Building the LSTM siamese network
    print("building lstm siamese network:")
    model = siamese_lstm(lstm_seq_length, vocabulary_size, lstm_core_length,
                         activation_function=activation_function,
                         inner_activation_function=inner_activation_function,
                         distance_function=distance_function,
                         initializer=initializer,
                         inner_initializer=inner_initializer,
                         regularizer=regularizer, optimizer=optimizer,
                         dropout=dropout,
                         embedding_dimension=embedding_dimension)

    # Saving the model structure to file
    model_structure_path = os.path.join(model_dir_path, "model_structure")
    save_model_structure(model, model_structure_path)
    print("finished building lstm siamese network.")

    # Building the LSTM validation set
    bug_val = np.zeros((0, lstm_seq_length, vocabulary_size))
    code_val = np.zeros((0, lstm_seq_length, vocabulary_size))
    rel_val = np.zeros((0,))
    bug_contents_val = bug_contents[nb_train_bug:]
    nb_validation_bug = 100
    for bug_batch, code_batch, label_batch in batch_gen(
            bug_contents_val, sequence_oracle, tokenizer, vocabulary_size,
            lstm_seq_length, nb_validation_bug, word2vec_model,
            embedding_dimension=embedding_dimension, sample_num=sample_num,
            word2vec=word2vec):
        bug_val = np.vstack((bug_val, bug_batch))
        code_val = np.vstack((code_val, code_batch))
        rel_val = np.append(rel_val, label_batch, axis=0)
    print(bug_val.shape)

    # Training the LSTM siamese network
    print("training lstm siamese network:")
    acc_train_list = []
    acc_val_list = []
    for epoch in range(epoch_num):
        bug_train = np.zeros((0, lstm_seq_length, vocabulary_size))
        code_train = np.zeros((0, lstm_seq_length, vocabulary_size))
        rel_train = np.zeros((0,))
        print("training epoch {}:".format(epoch))
        batch_index = 1
        for bug_batch, code_batch, label_batch in batch_gen(
                bug_contents, sequence_oracle, tokenizer, vocabulary_size,
                lstm_seq_length, nb_train_bug, word2vec_model,
                embedding_dimension=embedding_dimension, sample_num=sample_num,
                word2vec=word2vec):
            print("training batch {}, size {}".format(batch_index, len(bug_batch)))
            model.train_on_batch([bug_batch, code_batch], label_batch)
            batch_index = batch_index + 1
            bug_train = np.vstack((bug_train, bug_batch))
            code_train = np.vstack((code_train, code_batch))
            rel_train = np.append(rel_train, label_batch, axis=0)

            # predicting the training accuracy of this batch
            pred_batch = model.predict([bug_batch, code_batch])
            print(pred_batch)
            print(label_batch)
            acc_batch = predict_accuracy(pred_batch, label_batch)
            print("training accuracy = {}".format(acc_batch))
            acc_train_list.append(acc_batch)

            # predicting the validation accuracy of this batch
            pred_val = model.predict([bug_val, code_val])
            print(pred_val)
            print(rel_val)
            acc_val = predict_accuracy(pred_val, rel_val)
            print("validation accuracy = {}".format(acc_val))
            acc_val_list.append(acc_val)

        # save the model weights after this epoch to file
        one_epoch_weight_path = os.path.join(model_dir_path,
                                             "weight_epoch_{}".format(epoch))
        save_model_weights(model, one_epoch_weight_path)

        # compute the validation accuracy
        # pred_val = model.predict([bug_val, code_val])
        # acc_val = predict_accuracy(pred_val, rel_val)
        # print("validation accuracy = {}".format(acc_val))
        # acc_val_list.append(acc_val)

    print("finished training lstm siamese network.")
    # plt.plot(acc_train_list)
    # plt.plot(acc_val_list)
    # plt.savefig('learning_curve.eps')

    # Generating predictions on the test bugs
    print("computing predictions on the test data:")
    # Code vectors
    code_vec_list = generate_code_vec(model, code_contents, lstm_seq_length,
                                      tokenizer, vocabulary_size, word2vec_model,
                                      embedding_dimension=embedding_dimension,
                                      word2vec=word2vec)
    # Test bug vectors
    bug_vec_list = generate_bug_vec(model, bug_contents[nb_train_bug:],
                                    lstm_seq_length, tokenizer, vocabulary_size,
                                    word2vec_model,
                                    embedding_dimension=embedding_dimension,
                                    word2vec=word2vec)
    # Generating oracles for test bugs
    test_oracle = generate_test_oracle(file_oracle[nb_train_bug:])
    # Generating prediction scores for each test bug
    predictions = generate_predictions_full(bug_vec_list, code_vec_list)
    if not os.path.isdir(prediction_dir_path):
        os.mkdir(prediction_dir_path)

    i = 1
    # Traversing each bug oracle/prediction result
    for one_test_oracle, prediction in zip(test_oracle, predictions):
        if len(one_test_oracle) > 0:
            # Export
            file_path = os.path.join(prediction_dir_path, "bug_num_{}".format(i))
            export_one_bug_prediction(one_test_oracle, prediction, file_path)
            # Some strategies for ...
            # evaluations = evaluate_one_bug(prediction, one_test_oracle)
            # print(evaluations)
            # export_one_evaluation(evaluations, evaluation_file_path)
        i = i + 1
    print("finished computing predictions on the test data.")

    # Evaluating performance on test bugs
    print("evaluating performance on the test data:")
    evaluate_prediction_dir(prediction_dir_path, evaluation_file_path)
    print("finished evaluating performance on the test data.")
def _load_quora_data(data_file, max_length=60, validation_split=5000,
                     test_split=5000, seed=100, tokenizer="nltk",
                     processor_config_filepath='preprocessor.pkl'):
    """
    Load Quora dataset from TSV file.

    :params:
        - data_file: TSV data file provided by Quora
        - max_length: Max length of questions; question data will be truncated
          up to this length
        - validation_split: How much to sample for validation
        - test_split: How much to sample for testing
        - seed: Random seed
        - tokenizer: Tokenizer to use ('nltk' by default); falls back to
          text_to_word_sequence when empty
        - processor_config_filepath: Where to save tokenizer etc.
    """
    # Read data file
    data = pd.read_csv(data_file, sep='\t')

    # Shuffle and split dataframe
    np.random.seed(seed)
    data = data.iloc[np.random.permutation(len(data))]
    train_df, valid_df, test_df = (data.iloc[:-(validation_split + test_split)],
                                   data.iloc[-(validation_split + test_split):-test_split],
                                   data.iloc[-test_split:])

    convert_list_to_str = lambda x: list(map(str, x))
    train_question1 = convert_list_to_str(train_df['question1'].tolist())
    train_question2 = convert_list_to_str(train_df['question2'].tolist())
    y_train = train_df['is_duplicate']
    valid_question1 = convert_list_to_str(valid_df['question1'].tolist())
    valid_question2 = convert_list_to_str(valid_df['question2'].tolist())
    y_valid = valid_df['is_duplicate']
    test_question1 = convert_list_to_str(test_df['question1'].tolist())
    test_question2 = convert_list_to_str(test_df['question2'].tolist())
    y_test = test_df['is_duplicate']

    if not tokenizer:
        tokenizer = text_to_word_sequence
    else:
        tokenizer = get_tokenizer(tokenizer)

    def process_list_of_text(list_of_text):
        tokenized = list(map(tokenizer, list_of_text))
        return pad_sequences(tokenized, maxlen=max_length, dtype=object,
                             padding='pre', value="-pad-")

    # Processing training data
    train_question1 = process_list_of_text(train_question1)
    train_question2 = process_list_of_text(train_question2)
    # Processing validation data
    valid_question1 = process_list_of_text(valid_question1)
    valid_question2 = process_list_of_text(valid_question2)
    # Processing test data
    test_question1 = process_list_of_text(test_question1)
    test_question2 = process_list_of_text(test_question2)

    return (train_question1, train_question2), y_train, \
           (valid_question1, valid_question2), y_valid, \
           (test_question1, test_question2), y_test
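# Illustrative usage (a sketch, not part of the original source). The TSV path
# is an assumption; the function returns question pairs already tokenized and
# padded to max_length.
if __name__ == "__main__":
    (q1_train, q2_train), y_train, (q1_val, q2_val), y_val, \
        (q1_test, q2_test), y_test = _load_quora_data("quora_duplicate_questions.tsv")
    print(q1_train.shape, q1_val.shape, q1_test.shape)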