def generate_training_data(db_path, bins, min_program_length, max_program_length,
                           max_fix_length, max_mutations, max_variants, seed):
    """Create mutated program variants split into train/validation/test sets.

    Every row of the ``Code`` table whose ``codelength`` lies strictly between
    ``min_program_length`` and ``max_program_length`` is selected.  The
    selected code ids are shuffled with a ``RandomState`` seeded by ``seed``
    and split 10% validation / 10% test / remainder train.  Each selected
    program is handed to ``typo_mutate``; every ``(corrupt_program, fix)`` pair
    whose corrupt length is within ``[min_program_length, max_program_length]``
    and whose fix length is at most ``max_fix_length`` is kept.

    Args:
        db_path: Path of the SQLite database holding the ``Code`` table.
        bins: Unused.  The original implementation flattened it into a list
            that was overwritten before being read; the parameter is kept so
            existing callers continue to work.
        min_program_length: Exclusive lower bound on ``codelength`` for source
            programs, inclusive lower bound for corrupted programs.
        max_program_length: Exclusive upper bound on ``codelength`` for source
            programs, inclusive upper bound for corrupted programs.
        max_fix_length: Maximum number of whitespace-separated tokens in a fix.
        max_mutations: Forwarded to ``typo_mutate``.
        max_variants: Forwarded to ``typo_mutate``.
        seed: Seed for the ``numpy.random.RandomState`` shared by the split
            shuffle and the mutator.

    Returns:
        A tuple ``(token_strings, mutation_distribution)``.  ``token_strings``
        maps ``'train'``, ``'validation'`` and ``'test'`` to dictionaries of
        ``code_id -> [(code_id, corrupt_program, tokenized_program), ...]``;
        ``mutation_distribution`` is ``mutator_obj.get_mutation_distribution()``.

    Raises:
        Exception: Anything raised by the mutate call other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is counted and re-raised.
    """
    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
    mutator_obj = Typo_Mutate_Java(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}, 'test': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []
    length_bounds = (min_program_length, max_program_length)

    # ``with sqlite3.connect(...)`` only manages the transaction, it does not
    # close the connection, so close it explicitly.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Collect every eligible code_id, then shuffle so the split is random
        # but reproducible for a given seed.
        query = "SELECT code_id FROM Code WHERE codelength>? and codelength<?;"
        code_id_list = [row[0] for row in cursor.execute(query, length_bounds)]
        rng.shuffle(code_id_list)

        # Split into validation / test / train: 10% / 10% / 80%.
        tenth = int(0.1 * len(code_id_list))
        validation_code_ids = set(code_id_list[:tenth])
        test_code_ids = set(code_id_list[tenth:tenth * 2])
        training_code_ids = set(code_id_list[tenth * 2:])

        # Make sure the three splits do not intersect.
        assert not (training_code_ids & validation_code_ids)
        assert not (training_code_ids & test_code_ids)
        assert not (validation_code_ids & test_code_ids)

        # One dictionary lookup per row instead of three list scans.
        split_of = {}
        for split_name, split_ids in (('validation', validation_code_ids),
                                      ('test', test_code_ids),
                                      ('train', training_code_ids)):
            for split_code_id in split_ids:
                split_of[split_code_id] = split_name

        query = ("SELECT code_id, tokenized_code, codelength FROM Code "
                 "WHERE codelength>? and codelength<?;")
        for code_id, tokenized_program, program_length in cursor.execute(
                query, length_bounds):
            key = split_of[code_id]
            program_lengths.append(program_length)

            if not (min_program_length < program_length < max_program_length):
                continue

            total_mutate_calls += 1
            try:
                iterator = mutate(tokenized_program, max_mutations, max_variants)
            except (FailedToMutateException,
                    LoopCountThresholdExceededException):
                # Expected mutation failures: report the program and move on.
                print(code_id)
                exceptions_in_mutate_call += 1
                continue
            except Exception:
                # Anything else is unexpected: report, count and propagate.
                print(code_id)
                exceptions_in_mutate_call += 1
                raise

            tokenized_program = remove_empty_new_lines(
                convert_to_new_line_format(tokenized_program))

            for corrupt_program, fix in iterator:
                corrupt_program_length = len(corrupt_program.split())
                fix_length = len(fix.split())
                fix_lengths.append(fix_length)

                if (min_program_length <= corrupt_program_length <= max_program_length
                        and fix_length <= max_fix_length):
                    corrupt_program = remove_empty_new_lines(
                        convert_to_new_line_format(corrupt_program))
                    # The entry is created only when a variant is actually kept.
                    token_strings[key].setdefault(code_id, []).append(
                        (code_id, corrupt_program, tokenized_program))
    finally:
        conn.close()

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length: Mean = %s \t95th %%ile = %s' % (
            np.mean(program_lengths),
            program_lengths[int(0.95 * len(program_lengths))]))
    else:
        # Indexing an empty array would raise IndexError.
        print('Program length: no programs selected')
    try:
        print('Mean fix length: Mean = %s \t95th %%ile =  %s' % (
            np.mean(fix_lengths),
            fix_lengths[int(0.95 * len(fix_lengths))]))
    except IndexError as e:
        # No variant was produced, so there is no percentile to report.
        print(e)
        print('fix_lengths')
    print('Total mutate calls: %s' % total_mutate_calls)
    print('Exceptions in mutate() call: %s \n' % exceptions_in_mutate_call)

    return token_strings, mutator_obj.get_mutation_distribution()
def __init__(self, dataset, step_penalty, seed, GE_ratio=None, top_down_movement=True,
             single_delete=True, reject_spurious_edits=True, compilation_error_store=None,
             train_data_size=0, valid_data_size=0, test_data_size=0, GE_code_ids=None,
             actions=None, verbose=False, single_program=None, sparse_rewards=True):
    """Initialise the environment on a dataset or on one single program.

    Args:
        dataset: Object providing ``get_tl_dict()`` and, on the dataset path,
            ``data_size``, ``train_ex``, ``valid_ex`` and ``test_ex``;
            ``real_test_data`` and ``seeded_test_data`` are optional.
        step_penalty, seed, top_down_movement, single_delete,
        reject_spurious_edits, compilation_error_store, actions,
        sparse_rewards: Forwarded unchanged to ``Env_engine.__init__``.
        GE_ratio: Fraction of the training ids to sample as ``'GE_train'``.
            Exactly one of ``GE_ratio`` and ``GE_code_ids`` must be given.
        GE_code_ids: Explicit ``'GE_train'`` ids, used as passed in.
        train_data_size, valid_data_size, test_data_size: Size caps; ``0``
            means "use the size reported by ``dataset.data_size``".
        verbose: Stored on ``self.verbose``.
        single_program: When not ``None``, the program text is tokenized and
            becomes the only example, under the key ``'single'``.
    """
    # Exactly one of GE_ratio / GE_code_ids has to be supplied.
    assert (GE_ratio is None) != (GE_code_ids is None)

    Env_engine.__init__(self, dataset.get_tl_dict(), seed,
                        step_penalty=step_penalty,
                        top_down_movement=top_down_movement,
                        reject_spurious_edits=reject_spurious_edits,
                        compilation_error_store=compilation_error_store,
                        single_delete=single_delete,
                        actions=actions,
                        sparse_rewards=sparse_rewards)

    # Set on both paths; previously the single-program path left it unassigned.
    self.verbose = verbose

    if single_program is not None:
        td = self.tl_dict
        tokenized_program, name_dict, name_sequence = \
            C_Tokenizer().tokenize_single_program(single_program)
        # NOTE(review): verbose=True makes namedtuple print the generated
        # class definition and is only accepted by older Python versions.
        single_ex_dataset = namedtuple('single_ex_dataset',
                                       ['single_ex', 'name_dict_store'],
                                       verbose=True)
        self.dataset = single_ex_dataset(
            single_ex={
                'single': (self.vectorize(tokenized_program),
                           [td['EOF'], td['-new-line-'], td['_pad_']])
            },
            name_dict_store={'single': (name_dict, name_sequence)})
        self.data_sizes = {'single': 1}
        self.code_ids = {'single': ['single']}
        return

    self.dataset = dataset

    # A requested size of 0 means "everything"; otherwise cap at what exists.
    if train_data_size == 0:
        train_data_size = dataset.data_size[0]
    else:
        train_data_size = min(train_data_size, dataset.data_size[0])
    if valid_data_size == 0:
        valid_data_size = dataset.data_size[1]
    else:
        valid_data_size = min(valid_data_size, dataset.data_size[1])
    if test_data_size == 0:
        test_data_size = dataset.data_size[2]
    else:
        test_data_size = min(test_data_size, dataset.data_size[2])

    # list(...) keeps slicing and the in-place shuffle below working even
    # when keys() returns a view instead of a list.
    train_code_ids = list(self.dataset.train_ex.keys())[:train_data_size]

    if GE_code_ids is None:
        guided_train_data_size = int(GE_ratio * train_data_size)
        guided_train_code_ids = set(
            self.rng.choice(train_code_ids, guided_train_data_size,
                            replace=False))
    else:
        guided_train_data_size = len(GE_code_ids)
        guided_train_code_ids = GE_code_ids

    # raw test dataset (optional attribute of the dataset)
    real_data_size = 0
    real_data_keys = []
    try:
        self.real_test_data = self.dataset.real_test_data
    except AttributeError:
        pass
    else:
        real_data_size = len(self.real_test_data)
        real_data_keys = list(self.real_test_data.keys())

    # seeded test dataset (optional attribute of the dataset)
    seeded_data_size = 0
    seeded_data_keys = []
    try:
        self.seeded_test_data = self.dataset.seeded_test_data
    except AttributeError:
        pass
    else:
        seeded_data_size = len(self.seeded_test_data)
        seeded_data_keys = list(self.seeded_test_data.keys())

    self.data_sizes = {
        'train': train_data_size,
        'valid': valid_data_size,
        'test': test_data_size,
        'real': real_data_size,
        'seeded': seeded_data_size,
        'GE_train': guided_train_data_size
    }
    self.code_ids = {
        'train': train_code_ids,
        'GE_train': guided_train_code_ids,
        'valid': list(self.dataset.valid_ex.keys()),
        'test': list(self.dataset.test_ex.keys()),
        'real': real_data_keys,
        'seeded': seeded_data_keys
    }
    # Shuffled only after the guided subset was drawn from the unshuffled ids.
    self.rng.shuffle(self.code_ids['train'])
import os
import sqlite3
import json
from util.c_tokenizer import C_Tokenizer

# Script: reads (code_id, code) rows from the Code table, tokenizes the code
# with C_Tokenizer and collects the results in `tuples`.
# NOTE(review): the original line structure was lost in this view; the
# nesting under the `if` below is a reconstruction -- confirm against the
# real file. `tuples` is filled but not consumed in the visible part, so the
# script presumably continues (or continued) past this excerpt.
tokenize = C_Tokenizer().tokenize
db_path = 'C:\\UNI\\projects\\rlassist\\data\\iitk-dataset\\prutor_b.db'

# Disabled schema migration that adds the columns this script computes:
#
# with sqlite3.connect(db_path) as conn:
#     conn.execute('''ALTER TABLE Code ADD tokenized_code text;''')
#     conn.execute('''ALTER TABLE Code ADD name_dict;''')
#     conn.execute('''ALTER TABLE Code ADD name_seq;''')
#     conn.execute('''ALTER TABLE Code ADD codelength integer;''')

# Each entry: (tokenized_code, name_dict as JSON, name_seq as JSON,
# codelength, code_id).
tuples = []
with sqlite3.connect(db_path) as conn:
    cursor = conn.cursor()
    for row in cursor.execute("SELECT code_id, code FROM Code;"):
        code_id = str(row[0])
        # Only one hard-coded program id is inspected here.
        if code_id == "prog56277":
            print("code id:", code_id)
            code = row[1].encode('utf-8')
            print(code)
            tokenized_code, name_dict, name_seq = tokenize(code)
            print(tokenized_code)
            print(name_dict)
            print(name_seq)
            # Length is the number of whitespace-separated tokens.
            codelength = len(tokenized_code.split())
            tuples.append((tokenized_code, json.dumps(name_dict),
                           json.dumps(name_seq), codelength, code_id))
#
import argparse
import sqlite3
import numpy as np
from data_processing.training_data_generator import load_dictionaries
from util.helpers import remove_empty_new_lines
from util.c_tokenizer import C_Tokenizer

# Locations of the input data sets, relative to the working directory.
deepfix_base_dir = 'data/deepfix-test-data/'
RLAssist_base_dir = 'data/network_inputs/RLAssist-seed-1189/'
iitk_db_path = 'data/iitk-dataset/dataset.db'

# Default for vectorize()'s max_vector_length parameter.
max_program_len = 45000
dummy_correct_program = '_eos_ -new-line- _pad_'

tokenize = C_Tokenizer().tokenize
convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
# Converts to new-line format, then strips empty new lines.
convert_to_rla_format = lambda x: remove_empty_new_lines(
    convert_to_new_line_format(x))

raw_test_data = {}
seeded_test_data = {}


def vectorize(tokens, tldict, max_vector_length=max_program_len):
    """Look up every whitespace-separated token of ``tokens`` in ``tldict``.

    Returns ``None`` as soon as a lookup fails.

    NOTE(review): the body visible here stops after the loop --
    ``max_vector_length`` is never used and the collected ``vec_tokens`` are
    not returned. The remainder is probably outside this excerpt; confirm
    against the full file.
    """
    vec_tokens = []
    for token in tokens.split():
        try:
            vec_tokens.append(tldict[token])
        except Exception:
            # A token that cannot be looked up invalidates the whole program.
            return None
def generate_training_data(db_path, bins, validation_users, min_program_length, max_program_length,
                           max_fix_length, max_mutations, max_variants, seed):
    """Create mutated program variants grouped by problem for train/validation.

    For every problem id listed in ``bins`` the error-free programs
    (``errorcount=0``) whose ``codelength`` lies strictly between
    ``min_program_length`` and ``max_program_length`` are read from the
    ``Code`` table and handed to ``typo_mutate``.  Every
    ``(corrupt_program, fix)`` pair whose corrupt length is within
    ``[min_program_length, max_program_length]`` and whose fix length is at
    most ``max_fix_length`` is kept.

    Args:
        db_path: Path of the SQLite database holding the ``Code`` table.
        bins: Iterable of iterables of problem ids.
        validation_users: Mapping ``problem_id -> container of user ids``;
            programs by those users go to ``'validation'``, all others to
            ``'train'``.  User ids are compared as ``str``.
        min_program_length: Lower bound on the program token count.
        max_program_length: Upper bound on the program token count.
        max_fix_length: Maximum number of whitespace-separated tokens in a fix.
        max_mutations: Forwarded to ``typo_mutate``.
        max_variants: Forwarded to ``typo_mutate``.
        seed: Seed for the ``numpy.random.RandomState`` given to the mutator.

    Returns:
        A tuple ``(token_strings, mutation_distribution)``.  ``token_strings``
        maps ``'train'`` and ``'validation'`` to dictionaries of
        ``problem_id -> [(code_id, corrupt_program, tokenized_program), ...]``;
        ``mutation_distribution`` is ``mutator_obj.get_mutation_distribution()``.

    Raises:
        Exception: Anything raised by the mutate call other than
            ``FailedToMutateException`` / ``LoopCountThresholdExceededException``
            is counted and re-raised.
    """
    rng = np.random.RandomState(seed)
    convert_to_new_line_format = C_Tokenizer().convert_to_new_line_format
    mutator_obj = Typo_Mutate(rng)
    mutate = partial(typo_mutate, mutator_obj)

    token_strings = {'train': {}, 'validation': {}}
    exceptions_in_mutate_call = 0
    total_mutate_calls = 0
    program_lengths, fix_lengths = [], []

    # Flatten the bins into one list of problem ids.
    problem_list = [problem_id for bin_ in bins for problem_id in bin_]

    query = ("SELECT user_id, code_id, tokenized_code FROM Code "
             "WHERE problem_id=? and codelength>? and codelength<? and errorcount=0;")

    # ``with sqlite3.connect(...)`` only manages the transaction, it does not
    # close the connection, so close it explicitly.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        for problem_id in problem_list:
            params = (problem_id, min_program_length, max_program_length)
            for row in cursor.execute(query, params):
                user_id, code_id, tokenized_program = map(str, row)
                if user_id in validation_users[problem_id]:
                    key = 'validation'
                else:
                    key = 'train'

                # Length is recomputed from the token string rather than
                # taken from the codelength column.
                program_length = len(tokenized_program.split())
                program_lengths.append(program_length)

                if not (min_program_length <= program_length <= max_program_length):
                    continue

                # Mutate
                total_mutate_calls += 1
                try:
                    iterator = mutate(tokenized_program, max_mutations, max_variants)
                except (FailedToMutateException,
                        LoopCountThresholdExceededException):
                    # Expected mutation failures: count and skip the program.
                    exceptions_in_mutate_call += 1
                    continue
                except Exception:
                    # Anything else is unexpected: count and propagate.
                    exceptions_in_mutate_call += 1
                    raise

                tokenized_program = remove_empty_new_lines(
                    convert_to_new_line_format(tokenized_program))

                for corrupt_program, fix in iterator:
                    corrupt_program_length = len(corrupt_program.split())
                    fix_length = len(fix.split())
                    fix_lengths.append(fix_length)

                    if (min_program_length <= corrupt_program_length <= max_program_length
                            and fix_length <= max_fix_length):
                        corrupt_program = remove_empty_new_lines(
                            convert_to_new_line_format(corrupt_program))
                        # The entry is created only when a variant is kept.
                        token_strings[key].setdefault(problem_id, []).append(
                            (code_id, corrupt_program, tokenized_program))
    finally:
        conn.close()

    program_lengths = np.sort(program_lengths)
    fix_lengths = np.sort(fix_lengths)

    print('Statistics')
    print('----------')
    if len(program_lengths):
        print('Program length: Mean = %s \t95th %%ile = %s' % (
            np.mean(program_lengths),
            program_lengths[int(0.95 * len(program_lengths))]))
    else:
        # Indexing an empty array would raise IndexError.
        print('Program length: no programs selected')
    try:
        print('Mean fix length: Mean = %s \t95th %%ile =  %s' % (
            np.mean(fix_lengths),
            fix_lengths[int(0.95 * len(fix_lengths))]))
    except IndexError as e:
        # No variant was produced, so there is no percentile to report.
        print(e)
        print('fix_lengths')
        print(fix_lengths)
    print('Total mutate calls: %s' % total_mutate_calls)
    print('Exceptions in mutate() call: %s \n' % exceptions_in_mutate_call)

    # Per-split, per-problem count of stored variants.
    for key in token_strings:
        print(key)
        for problem_id in token_strings[key]:
            print('%s %s' % (problem_id, len(token_strings[key][problem_id])))

    return token_strings, mutator_obj.get_mutation_distribution()