def __init__(self, x_size, h_size, type_relationship: Union[str, Dict]):
    """type_relationship is a dict with information about children
    key: tuple with type ids of the src group, e.g. If and Switch statements
    value: list of lists, where each list corresponds to type ids of some group
    {
        ...
        (src_type_id_1, ..., src_type_id_n): [
            [dst_group_type_id_1, ..., dst_group_type_id_k],
            ...,
            [dst_group_type_id_1, ..., dst_group_type_id_m]
        ]
        ...
    }
    """
    super().__init__(x_size, h_size)
    if isinstance(type_relationship, str):
        with open(type_relationship, 'rb') as pkl_file:
            self.type_relationship = pkl_load(pkl_file)
    else:
        self.type_relationship = type_relationship

    count_diff_matrix = 1
    # dict of matrix ids, key: (src_type_id, dst_type_id), value: matrix_id
    self.edge_matrix_id = {}
    for type_ids, groups in self.type_relationship.items():
        for dst_group in groups:
            for child_id in dst_group:
                for src_id in type_ids:
                    self.edge_matrix_id[(src_id, child_id)] = count_diff_matrix
            count_diff_matrix += 1
    self.U_f = nn.Parameter(torch.rand(count_diff_matrix, self.h_size, self.h_size), requires_grad=True)
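# A minimal sketch (not taken from the original project) of what `type_relationship`
# is expected to look like, based on the docstring above. The type ids 1, 2, 3 and
# 10, 11, 12 are placeholders; each inner list is a group of child type ids that
# shares one forget-gate matrix, and `edge_matrix_id` maps every
# (src_type_id, dst_type_id) pair to the index of that shared matrix.
example_type_relationship = {
    (1, 2): [
        [10, 11],   # first child group -> matrix 1
        [12],       # second child group -> matrix 2
    ],
    (3,): [
        [10],       # third child group -> matrix 3
    ],
}
# With the loop above this would yield:
# edge_matrix_id == {(1, 10): 1, (2, 10): 1, (1, 11): 1, (2, 11): 1,
#                    (1, 12): 2, (2, 12): 2, (3, 10): 3}
# and count_diff_matrix == 4, so U_f holds 4 matrices
# (index 0 is presumably reserved for unlisted edges).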
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # convert function to dot format
    print("prepare ast...")
    create_folder(TMP_FOLDER)
    if not build_ast(path_to_function):
        return
    ast_folder = os.path.join(TMP_FOLDER, 'java', 'asts')
    ast = os.listdir(ast_folder)
    if len(ast) == 0:
        print("didn't find any functions in given file")
        return
    if len(ast) > 1:
        print("too many functions in given file, for interactive prediction you need only one")
        return
    dgl_ast = convert_dot_to_dgl(os.path.join(ast_folder, ast[0]))
    ast_desc = pd.read_csv(os.path.join(TMP_FOLDER, 'java', 'description.csv'))
    ast_desc['token'].fillna('NAN', inplace=True)
    with open(vocab_path, 'rb') as pkl_file:
        vocab = pkl_load(pkl_file)
        token_to_id, type_to_id = vocab['token_to_id'], vocab['type_to_id']
    ast_desc = transform_keys(ast_desc, token_to_id, type_to_id)
    batched_graph, labels, paths = prepare_batch(ast_desc, ['ast_0.dot'], lambda: [dgl_ast])
    batched_graph = dgl.batch(
        list(map(lambda g: dgl.reverse(g, share_ndata=True), dgl.unbatch(batched_graph))))

    # load model
    print("loading model...")
    model, _ = load_model(path_to_model, device)
    criterion = nn.CrossEntropyLoss(ignore_index=model.decoder.pad_index).to(device)
    info = LearningInfo()

    print("forward pass...")
    batch_info, prediction = eval_on_batch(model, criterion, batched_graph, labels, device)
    info.accumulate_info(batch_info)

    id_to_sublabel = {v: k for k, v in model.decoder.label_to_id.items()}
    label = ''
    for cur_sublabel in prediction:
        if cur_sublabel.item() == model.decoder.label_to_id[EOS]:
            break
        label += '|' + id_to_sublabel[cur_sublabel.item()]
    label = label[1:]
    print(f"Predicted function name is\n{label}")
    print(f"Calculated metrics with respect to '{labels[0]}' name\n{info.get_state_dict()}")
def load(self, filename: str):
    """Load system state from .scad file.

    Parameters
    ----------
    filename: str
        Name of file to load.
    """
    with open(filename, 'rb') as f:
        state = pkl_load(f)
    if not isinstance(state, ProjectState):
        raise IncorrectTypeOfLoadedObject
    self._state = state
    self._history.clear()
    self._cancelled.clear()
    self._commit()
def remove_outliers(holdout_path: str, min_border: int, max_border: int) -> int:
    batches = os.listdir(holdout_path)
    removed = 0
    for batch_path in tqdm(batches):
        with open(os.path.join(holdout_path, batch_path), 'rb') as pkl_file:
            batch = pkl_load(pkl_file)
        graphs = dgl.unbatch(batch['batched_graph'])
        labels = batch['labels']
        paths = batch['paths']
        orig_size = len(graphs)
        graphs, labels, paths = zip(*filter(
            lambda cur: min_border <= cur[0].number_of_nodes() <= max_border,
            zip(graphs, labels, paths)
        ))
        with open(os.path.join(holdout_path, batch_path), 'wb') as pkl_file:
            pkl_dump({'batched_graph': dgl.batch(graphs), 'labels': labels, 'paths': paths}, pkl_file)
        removed += orig_size - len(graphs)
    return removed
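# A hypothetical usage sketch: the holdout path and node-count borders below are
# placeholders, not values from the original project. This would rewrite every
# batch in place, keeping only graphs with 3 to 500 nodes, and report the count
# of dropped functions.
n_removed = remove_outliers('data/java-small/training_preprocessed', min_border=3, max_border=500)
print(f"removed {n_removed} outlier functions")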
def __getitem__(self, item) -> Tuple[BatchedDGLGraph, List[str]]:
    batch_basename, batch_slice = self.batch_desc[item]

    # read file only if previous wasn't the same
    if self.loaded_batch_basename != batch_basename:
        with open(path_join(self.batched_graphs_path, batch_basename), 'rb') as pkl_file:
            self.loaded_batched_graph = pkl_load(pkl_file)
        self.loaded_batch_basename = batch_basename

    graphs = unbatch(self.loaded_batched_graph['batched_graph'])
    graphs_for_batch = graphs[batch_slice]
    if self.invert_edges:
        graphs_for_batch = list(map(lambda g: reverse(g, share_ndata=True), graphs_for_batch))
    batched_graph = batch(graphs_for_batch)
    batched_labels = self.loaded_batched_graph['labels'][batch_slice]
    return batched_graph, batched_labels
def __init__(self, batched_graphs_path: str, batch_size: int, invert_edges: bool = False) -> None:
    self.batched_graphs_path = batched_graphs_path
    self.batch_size = batch_size
    self.invert_edges = invert_edges
    assert path_exists(self.batched_graphs_path)
    self.batched_graph_files = sorted(
        list(filter(lambda filename: filename.endswith('.pkl'), listdir(self.batched_graphs_path))),
        key=lambda name: int(name[6:-4]))
    self.batch_desc = {}
    self.n_batches = 0
    self.loaded_batch_basename = None
    self.loaded_batched_graph = None

    # iterate over pkl files to aggregate information about batches
    print(f"prepare the {batched_graphs_path} dataset...")
    for batched_graph_file in tqdm(self.batched_graph_files):
        with open(path_join(self.batched_graphs_path, batched_graph_file), 'rb') as pkl_file:
            batched_graph = pkl_load(pkl_file)
        n_graphs = len(batched_graph['batched_graph'].batch_num_nodes)
        batches_per_file = n_graphs // self.batch_size + (1 if n_graphs % self.batch_size > 0 else 0)

        # collect information from the file
        for batch_id in range(batches_per_file):
            batch_slice = slice(batch_id * self.batch_size,
                                min((batch_id + 1) * self.batch_size, n_graphs))
            self.batch_desc[self.n_batches + batch_id] = (batched_graph_file, batch_slice)

        self.n_batches += batches_per_file
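# A minimal usage sketch, assuming the two methods above belong to the JavaDataset
# class used by the training script below; the path and batch size are placeholders,
# not values from the original project.
dataset = JavaDataset('data/java-small/training_preprocessed', batch_size=32, invert_edges=True)
print(f"{dataset.n_batches} batches available")
# __getitem__ re-batches a slice of the stored graphs and returns it with the matching labels
batched_graph, labels = dataset[0]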
def file_feeder(self, last_feeder_file=None):
    file_list_path = self.dataPath() + '11\\' + 'coldstore_sorted.pkl'
    if exists(file_list_path):
        with open(file_list_path, 'rb') as pkl:
            file_list = pkl_load(pkl)
        if last_feeder_file is not None:
            try:
                # resume after the last file that was already fed
                file_list = file_list[file_list.index(last_feeder_file) + 1:]
            except ValueError:
                # last_feeder_file not in the list, start from the beginning
                pass
        for file in file_list:
            self.compressor(file_path=file)
            self.status_updater(last_feeder_file=file)
        # remove(file_list_path)  # removes the pickled file list
    else:
        pass
def train(params: Dict, logging: str) -> None:
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    training_set = JavaDataset(params['paths']['train'], params['batch_size'], True)
    validation_set = JavaDataset(params['paths']['validate'], params['batch_size'], True)

    with open(params['paths']['vocabulary'], 'rb') as pkl_file:
        vocabulary = pkl_load(pkl_file)
        token_to_id = vocabulary['token_to_id']
        type_to_id = vocabulary['type_to_id']
        label_to_id = vocabulary['label_to_id']

    print('model initializing...')
    is_resumed = 'resume' in params
    if is_resumed:
        # load model
        model, checkpoint = load_model(params['resume'], device)
        start_batch_id = checkpoint['batch_id'] + 1
        configuration = checkpoint['configuration']
    else:
        # create model
        model_factory = ModelFactory(params['embedding'], params['encoder'], params['decoder'],
                                     params['hidden_states'], token_to_id, type_to_id, label_to_id)
        model: Tree2Seq = model_factory.construct_model(device)
        configuration = model_factory.save_configuration()
        start_batch_id = 0

    # create optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    # create scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params['scheduler_step_size'],
                                                gamma=params['scheduler_gamma'])
    # define loss function
    criterion = nn.CrossEntropyLoss(ignore_index=model.decoder.pad_index).to(device)

    # init logging class
    logger = None
    if logging == TerminalLogger.name:
        logger = TerminalLogger(params['checkpoints_folder'])
    elif logging == FileLogger.name:
        logger = FileLogger(params, params['logging_folder'], params['checkpoints_folder'])
    elif logging == WandBLogger.name:
        logger_args = ['treeLSTM', params, model, params['checkpoints_folder']]
        if 'resume_wandb_id' in params:
            logger_args.append(params['resume_wandb_id'])
        logger = WandBLogger(*logger_args)

    # train loop
    print("ok, let's train it")
    for epoch in range(params['n_epochs']):
        train_acc_info = LearningInfo()

        if epoch > 0:
            # specify start batch id only for first epoch
            start_batch_id = 0
        tqdm_batch_iterator = tqdm(range(start_batch_id, len(training_set)), total=len(training_set))
        tqdm_batch_iterator.update(start_batch_id)
        tqdm_batch_iterator.refresh()

        # iterate over training set
        for batch_id in tqdm_batch_iterator:
            graph, labels = training_set[batch_id]
            graph.ndata['token_id'] = graph.ndata['token_id'].to(device)
            graph.ndata['type_id'] = graph.ndata['type_id'].to(device)
            batch_info = train_on_batch(model, criterion, optimizer, scheduler, graph, labels, params, device)
            train_acc_info.accumulate_info(batch_info)

            # log current train process
            if is_current_step_match(batch_id, params['logging_step']):
                logger.log(train_acc_info.get_state_dict(), epoch, batch_id)
                train_acc_info = LearningInfo()

            # validate current model
            if is_current_step_match(batch_id, params['evaluation_step']) and batch_id != 0:
                eval_epoch_info = evaluate_dataset(validation_set, model, criterion, device)
                logger.log(eval_epoch_info.get_state_dict(), epoch, batch_id, False)

            # save current model
            if is_current_step_match(batch_id, params['checkpoint_step']) and batch_id != 0:
                logger.save_model(model, f'epoch_{epoch}_batch_{batch_id}.pt', configuration, batch_id=batch_id)

        logger.log(train_acc_info.get_state_dict(), epoch, len(training_set))
        eval_epoch_info = evaluate_dataset(validation_set, model, criterion, device)
        logger.log(eval_epoch_info.get_state_dict(), epoch, len(training_set), False)
        logger.save_model(model, f'epoch_{epoch}.pt', configuration)
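# A hedged example of the `params` dict that `train` consumes. Every key below is
# read somewhere in the function above, but the concrete values and paths are
# illustrative placeholders, not the project's original configuration; the nested
# embedding/encoder/decoder/hidden_states dicts are whatever ModelFactory expects.
example_params = {
    'paths': {
        'train': 'data/java-small/training_preprocessed',
        'validate': 'data/java-small/validation_preprocessed',
        'vocabulary': 'data/java-small/vocabulary.pkl',
    },
    'batch_size': 32,
    'embedding': {},       # structure consumed by ModelFactory (not shown here)
    'encoder': {},
    'decoder': {},
    'hidden_states': {},
    'lr': 1e-3,
    'weight_decay': 0.0,
    'scheduler_step_size': 10,
    'scheduler_gamma': 0.95,
    'n_epochs': 10,
    'logging_step': 100,
    'evaluation_step': 1000,
    'checkpoint_step': 1000,
    'logging_folder': 'logs',
    'checkpoints_folder': 'checkpoints',
    # optional: 'resume': 'checkpoints/epoch_0_batch_1000.pt', 'resume_wandb_id': '...'
}
# train(example_params, logging=TerminalLogger.name)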
import flask

# GridSearchCV moved from sklearn.grid_search to sklearn.model_selection in newer scikit-learn
from sklearn.model_selection import GridSearchCV
from preprocessor import transform, processes as lang_processes
from build_committee import param_sets, prefs as bc_prefs, analyzer, classes, int_to_class

application = flask.Flask(__name__)

committee = {}
for params in param_sets:
    committee[params] = {}
    votes = 1
    for pref in bc_prefs:
        committee[params][pref] = (None, 0)
        try:
            clf_name = params + '.' + pref
            # pickle.load expects a binary file object, not a string
            with open('committee/' + clf_name + '.pkl', 'rb') as f:
                clf = pkl_load(f)
            clf.estimator.steps[0][1].analyzer = analyzer
            clf.best_estimator_.steps[0][1].analyzer = analyzer
            committee[params][pref] = (clf, votes)
            print('[' + __name__ + ']\t' + clf_name + ' loaded')
        except IOError:
            pass

params = {}
prefs = ['ie', 'ns', 'ft', 'jp']
clf_types = ['svc', 'nb', 'knn']
doc_types = ['text', 'tweet']

classifiers = {}
personalities = {}
personalities_description = {}
def main(args: Namespace) -> None:
    dataset_name = dataset_mapping[args.dataset]
    data_path = os.path.join(data_folder, dataset_name)
    create_folder(data_folder, is_clean=False)
    create_folder(data_path, is_clean=False)

    if args.download:
        print(f"download {dataset_name} dataset...")
        tar_file_path = download_dataset(dataset_name, data_folder)
        print(f"extract files from tar archive {tar_file_path}...")
        train_path, val_path, test_path = extract_dataset(tar_file_path, data_folder, dataset_name)
        print("remove tar file...")
        os.remove(tar_file_path)
    else:
        train_path, val_path, test_path = [os.path.join(data_path, folder) for folder in holdout_folders]

    if args.build_ast:
        if not all(os.path.exists(holdout_path) for holdout_path in [train_path, val_path, test_path]):
            raise RuntimeError("download and extract data before processing it via --download arg")
        if not os.path.exists(astminer_cli_path):
            raise RuntimeError(f"can't find astminer-cli in this location {astminer_cli_path}")
        holdout_ast_paths = {}
        for holdout in holdout_folders:
            holdout_ast_paths[holdout] = build_holdout_asts(data_path, holdout)
    else:
        holdout_ast_paths = {
            holdout: os.path.join(data_path, f'{holdout}_asts') for holdout in holdout_folders
        }

    vocabulary_path = os.path.join(data_path, vocabulary_name)
    if args.collect_vocabulary:
        token_to_id, type_to_id, label_to_id = collect_vocabulary(os.path.join(data_path, f'{holdout_folders[0]}_asts'))
        with open(vocabulary_path, 'wb') as pkl_file:
            pkl_dump({'token_to_id': token_to_id, 'type_to_id': type_to_id, 'label_to_id': label_to_id}, pkl_file)

    if args.convert:
        if not all(os.path.exists(path) for path in holdout_ast_paths.values()):
            raise RuntimeError("build ast before converting it via --build_ast arg")
        if not os.path.exists(vocabulary_path):
            raise RuntimeError("collect vocabulary before converting it via --collect_vocabulary arg")
        with open(vocabulary_path, 'rb') as pkl_file:
            pkl_data = pkl_load(pkl_file)
            token_to_id = pkl_data['token_to_id']
            type_to_id = pkl_data['type_to_id']
        holdout_preprocessed_paths = {}
        for holdout in holdout_folders:
            holdout_preprocessed_paths[holdout] = convert_holdout(
                data_path, holdout, token_to_id, type_to_id, args.n_jobs, args.batch_size, args.high_memory)
    else:
        holdout_preprocessed_paths = {
            holdout: os.path.join(data_path, f'{holdout}_preprocessed') for holdout in holdout_folders
        }

    if args.remove_outliers:
        if not all(os.path.exists(path) for path in holdout_preprocessed_paths.values()):
            raise RuntimeError("convert ast before removing outliers via --convert arg")
        if args.min_outlier == -1 or args.max_outlier == -1:
            raise ValueError("specify a min and max border for removing outliers")
        removed = remove_outliers(holdout_preprocessed_paths[holdout_folders[0]], args.min_outlier, args.max_outlier)
        print(f"removed {removed} functions from the training holdout")

    if args.upload:
        if not all(os.path.exists(path) for path in holdout_preprocessed_paths.values()):
            raise RuntimeError("convert ast before uploading it via --convert arg")
        tar_file_name = f'{dataset_name}_{args.tar_suffix}.tar.gz'
        completed_process = subprocess_run(
            ['tar', '-czf', tar_file_name, vocabulary_name] +
            [f'{holdout}_preprocessed' for holdout in holdout_folders],
            cwd=data_path)
        if completed_process.returncode != 0:
            print(f"can't create tar for preprocessed data, failed with\n{completed_process.stdout}")
        else:
            upload_file(os.path.join(data_path, tar_file_name), s3_bucket_name, tar_file_name)

    if args.download_preprocessed:
        for holdout, path in holdout_preprocessed_paths.items():
            tar_file_name = f'{dataset_name}_{holdout}_preprocessed.tar.gz'
            tar_path = os.path.join(data_path, tar_file_name)
            download_file(tar_path, s3_bucket_name, tar_file_name)
            create_folder(path)
            extract_tar_gz(tar_path, path)
        vocabulary_path = os.path.join(data_path, vocabulary_name)
        download_file(vocabulary_path, s3_bucket_name, f'{dataset_name}_{vocabulary_name}')

    if all(os.path.exists(path) for path in holdout_preprocessed_paths.values()):
        for holdout, path in holdout_preprocessed_paths.items():
            number_of_batches = len(os.listdir(path))
            print(f"There are {number_of_batches} batches in {holdout} data")
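# A hypothetical argparse setup matching the attributes that `main` reads from `args`.
# The flag names are inferred from the attribute names and error messages above and
# may differ from the original CLI; defaults are illustrative only.
if __name__ == '__main__':
    from argparse import ArgumentParser

    arg_parser = ArgumentParser()
    arg_parser.add_argument('dataset', choices=list(dataset_mapping.keys()))
    arg_parser.add_argument('--download', action='store_true')
    arg_parser.add_argument('--build_ast', action='store_true')
    arg_parser.add_argument('--collect_vocabulary', action='store_true')
    arg_parser.add_argument('--convert', action='store_true')
    arg_parser.add_argument('--n_jobs', type=int, default=-1)
    arg_parser.add_argument('--batch_size', type=int, default=128)
    arg_parser.add_argument('--high_memory', action='store_true')
    arg_parser.add_argument('--remove_outliers', action='store_true')
    arg_parser.add_argument('--min_outlier', type=int, default=-1)
    arg_parser.add_argument('--max_outlier', type=int, default=-1)
    arg_parser.add_argument('--upload', action='store_true')
    arg_parser.add_argument('--tar_suffix', type=str, default='preprocessed')
    arg_parser.add_argument('--download_preprocessed', action='store_true')
    main(arg_parser.parse_args())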
def load(self, pth):
    with gzip.open(pth, 'rb') as fh:
        return pkl_load(fh)
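# A matching save counterpart, sketched by analogy with `load` above; the method
# name and signature are assumptions, not part of the original class, and
# `pkl_dump` is assumed to be the pickle.dump alias used elsewhere in this code.
def save(self, obj, pth):
    with gzip.open(pth, 'wb') as fh:
        pkl_dump(obj, fh)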
        if done:
            break

    test_epoch_rewards.append(episode_acc_reward)

    # save most recent score
    if PROGRESS_LOG_STEP_FREQUENCY and episode_idx % PROGRESS_LOG_STEP_FREQUENCY == 0:
        print('\rEpisode {}\tAcc. Reward: {:.2f}\tEps: {:.3f}'.format(
            episode_idx, episode_acc_reward, agent.epsilon))

test_stats = DotMap({'test_epoch_time': '{:.3f}'.format(time() - test_start_time)})

results_savetofilename = 'results/test'
plot_rewards(saveto_filename=results_savetofilename, data=test_epoch_rewards, ylim=(-5, 25), dpi=320)

if RESULTS_CONFIG.SAVE_REWARDS_PLOT:
    results_savetofilename = 'acc_rewards_01'
    acc_rewards = pkl_load(open('./results/' + results_savetofilename + '.p', 'rb'))
    plot_rewards(saveto_filename=results_savetofilename, data=acc_rewards, ylim=(-5, 25), dpi=320)

env.close()
def search(self):
    drives = self.drive_finder()  # list of all drives
    copy_path = self.dataPath() + '7\\'  # address of folder 7
    dbfile_path = self.dataPath() + '11\\coldstore_file.json'  # address of the coldstore db
    final_list = []
    if exists(dbfile_path):  # check whether the coldstore db exists
        open_db = open(dbfile_path, 'rb')  # open coldstore in read mode
        final_dict = pkl_load(open_db)  # load the pickled coldstore dictionary into final_dict
        open_db.close()
    else:
        final_dict = {}  # set blank if the file was deleted or never created
    for drive in drives:  # loop through drives
        if GetDriveType(drive) == DRIVE_FIXED:  # check if the drive is permanent or removable
            # walk through all the files and folders of the drive
            for root, dirs, files in walk(drive, topdown=True):
                if self.dataPath() not in root or root.split("\\")[1] not in [
                        "Windows", "Program Files", "ProgramData", "Intel", "PrefLogs",
                        "MSOCache", "Boot", "Recovery", "Python27", "$Recycle.Bin"]:
                    for file in files:  # files lists all files in a directory, dirs all folders
                        target_path = copy_path + file  # location where the copy has to be created
                        name, ext = splitext(file)  # split name and extension of the file
                        src_path = join(root, file)  # address of the current file
                        src_mtime = getmtime(src_path)  # epoch time of last modification of the file
                        if ext in self.extensions or (ext in self.special_extensions and self.dataPath() in src_path):
                            if self.dataPath() in src_path and ext not in self.special_extensions:
                                continue
                            elif src_path not in final_dict or (src_path in final_dict and src_mtime != final_dict[src_path]):
                                print(src_path)
                                mod_time = getmtime(src_path)
                                final_dict[src_path] = mod_time
                                if exists(target_path) and isfile(target_path) and \
                                        self.md5_checksum(src_path) == self.md5_checksum(target_path):
                                    # file in the target location is identical to the source file
                                    continue
                                elif exists(target_path) and isfile(target_path) and \
                                        self.md5_checksum(src_path) != self.md5_checksum(target_path):
                                    # files have the same name but different content:
                                    # prepend the first 8 characters of the md5 checksum to the name
                                    new_name = self.md5_checksum(src_path)[:8] + '__' + file
                                    if not exists(copy_path + new_name):  # only if such a copy does not exist already
                                        copy2(src_path, copy_path + new_name)  # import the file into folder 7
                                    else:
                                        continue
                                else:
                                    if not exists(target_path):
                                        copy2(src_path, target_path)
                                    else:
                                        continue
                            else:
                                continue
                        else:
                            continue
    print(final_dict)
    jsn = open(dbfile_path, 'wb')
    pkl_dump(final_dict, jsn)
    jsn.close()
    temp_list = listdir(copy_path)
    for file in temp_list:
        filepath = copy_path + file
        final_list.append(filepath)
    final_list.sort(key=getmtime, reverse=True)
    with open(self.dataPath() + '11\\' + 'coldstore_sorted.pkl', 'wb') as pkl:
        pkl_dump(final_list, pkl)
    self.status_updater(search_flag=1, date_of_completion=time.time())
    self.file_feeder()
def file_slicer(self, compressed_path, original_path):
    megabyte = int(pow(1024, 2))
    counter = 1
    file_name = basename(compressed_path)  # name of the compressed file
    file_size = getsize(compressed_path)  # size of the compressed file
    sliced_path = self.dataPath() + '8\\'  # folder for the sliced files
    sliced_db = self.dataPath() + '11\\cold_sliced.json'  # db storing information about the sliced files
    chunk_dict = {}
    chunk_list = []
    state_dict = {}
    if file_size > megabyte:  # if file size is more than 1 MB
        total = ceil(file_size / megabyte)  # total number of slices to be formed
        with open(compressed_path, 'rb') as file:
            file_data = file.read()  # read compressed file
        while counter <= total:
            chunk = file_data[:megabyte]  # take a 1 MB chunk
            file_data = file_data[megabyte:]  # drop the 1 MB chunk from the buffer
            chunk_dir = sliced_path + file_name + '_' + str(total) + '\\'  # folder for this file's slices
            chunk_path = chunk_dir + file_name + '_' + str(counter) + '_' + str(total)
            chunk_list.append(chunk_path)
            if not exists(chunk_dir):
                makedirs(chunk_dir)
            current_modtime = getmtime(original_path)  # last modified time of the original file
            state_dict[original_path] = current_modtime  # stored with the original path as key
            # path of the file that stores state information about the original file
            state_path = chunk_dir + '\\' + basename(original_path) + '_state'
            with open(chunk_path, 'wb') as chunk_file:
                chunk_file.write(chunk)
            with open(state_path, 'wb') as state:
                pkl_dump(state_dict, state)
            counter += 1
        if exists(sliced_db):
            with open(sliced_db, 'rb') as db:  # open the existing sliced db
                chunk_dict = pkl_load(db)
        chunk_dict[original_path] = chunk_list  # add info about the new file
        with open(sliced_db, 'wb') as sl:  # write the updated db
            pkl_dump(chunk_dict, sl)
        for chunk in chunk_list:  # send each chunk for encryption
            self.encryption(chunk, original_path, True)
            remove(chunk)
    else:
        self.encryption(compressed_path, original_path, False)
        remove(compressed_path)
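# A hypothetical helper illustrating the mapping persisted above: `cold_sliced.json`
# holds a pickled dict from the original file path to the ordered list of chunk paths.
# The method name is an assumption; actually reassembling a file would also require
# the matching decryption step, which is outside this sketch.
def chunks_for(self, original_path):
    sliced_db = self.dataPath() + '11\\cold_sliced.json'
    if not exists(sliced_db):
        return []
    with open(sliced_db, 'rb') as db:
        return pkl_load(db).get(original_path, [])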