def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    logger.info(f"Starting to load data from {path}.")
    self.load_metadata(path)

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train.jsonl.gz"))
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")
def run_stats(graph_path: RichPath, output_path: RichPath):
    number_graphs, number_annotations, number_variables = 0, 0, 0
    annotation_table = Counter()
    # Annotations that carry no type information and are therefore ignored.
    skipped_annotations = {None, 'None', 'Nothing', 'Any'}
    data_generator = chain(
        *(g.read_as_jsonl() for g in graph_path.iterate_filtered_files_in_dir('*.jsonl.gz')))
    for data in data_generator:
        number_graphs += 1 if len(data['supernodes']) > 0 else 0
        number_variables += len(data['supernodes'])
        number_annotations += sum(
            1 for supernode in data['supernodes'].values()
            if supernode['annotation'] not in skipped_annotations)
        annotation_table.update(
            supernode['annotation'] for supernode in data['supernodes'].values()
            if supernode['annotation'] not in skipped_annotations)
    with open(output_path.to_local_path().path, "a") as f:
        f.write("Statistics for file: " + graph_path.to_local_path().path + "\n")
        f.write("Number of graphs: %d\n" % number_graphs)
        f.write("Number of variables: %d\n" % number_variables)
        f.write("Number of annotations: %d\n" % number_annotations)
        f.write("Number of different annotations: %d\n" % len(annotation_table))
        f.write("\nFrequency distribution of annotation types:\n\n")
        for annotation, value in annotation_table.most_common():
            f.write("%s\t%d\n" % (annotation, value))
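# A hedged usage sketch for run_stats above: graph_path should be a directory containing
# *.jsonl.gz graph files and output_path a writable local file; both paths below are
# illustrative, not taken from the repository.
from dpu_utils.utils import RichPath

run_stats(graph_path=RichPath.create("data/graphs/train"),
          output_path=RichPath.create("train_stats.txt"))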
def run(arguments, tag_in_vcs=False) -> RichPath:
    azure_info_path = arguments.get('--azure-info', None)
    train_folder = RichPath.create(arguments['TRAIN_DATA_PATH'], azure_info_path)
    valid_folder = RichPath.create(arguments['VALID_DATA_PATH'], azure_info_path)
    save_folder = arguments['SAVE_FOLDER']

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder,)
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder,)

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))
    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    hyperparameters['run_id'] = make_run_id(arguments)

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        hyperparameters['git_commit'] = git_tag_run(hyperparameters['run_id'])

    return run_train(model_class, train_folder, valid_folder, save_folder, hyperparameters,
                     arguments.get('--run-name'), arguments.get('--quiet', False))
def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    logger.info(f"Starting to load data from {path}.")

    if self.metadata == {}:
        metadata_path = path.join("metadata.pkl.gz")
        if metadata_path.exists():
            logger.info(f"Loading metadata from {metadata_path}")
            self._metadata = metadata_path.read_by_file_suffix()
    else:
        logger.warning("Using metadata passed to constructor, not metadata stored with data.")

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train.jsonl.gz"))
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")
def __init__(self, type_lattice_path: RichPath, bottom_symbol: str,
             built_in_aliases_path: RichPath):
    reference_lattice = type_lattice_path.read_as_json()

    id_aliases = {}
    self.__lattice_node_to_id = {}
    for i, n in enumerate(reference_lattice['nodes']):
        if n in self.__lattice_node_to_id:
            id_aliases[i] = self.__lattice_node_to_id[n]
        else:
            self.__lattice_node_to_id[n] = i
    assert self.__lattice_node_to_id[bottom_symbol] == 0, \
        'We assume below that bottom is at position 0'
    self.__id_to_node = reference_lattice['nodes']

    def alias_or_self(idx: int) -> int:
        if idx in id_aliases:
            return id_aliases[idx]
        else:
            return idx

    self.__aliases = dict(built_in_aliases_path.read_as_json()['aliasing_rules'])

    self.__is_a_edges = defaultdict(set)
    self.__child_edges = defaultdict(set)
    for from_id, to_id in reference_lattice['edges']:
        from_id = alias_or_self(from_id)
        to_id = alias_or_self(to_id)
        self.__is_a_edges[from_id].add(to_id)
        self.__child_edges[to_id].add(from_id)

    self._force_fix_lattice()
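# A minimal sketch of the two JSON inputs the constructor above reads, inferred only from
# the fields it accesses ('nodes', 'edges', 'aliasing_rules'); the concrete values and the
# /tmp paths are assumptions for illustration, not files shipped with the project.
import json

lattice = {
    # The bottom symbol must be the first node: the constructor asserts that its id is 0.
    "nodes": ["bottom", "object", "int"],
    # Edge entries are (from_id, to_id) pairs; the constructor records to_id under
    # __is_a_edges[from_id] and from_id under __child_edges[to_id].
    "edges": [[1, 0], [2, 1]],
}
aliases = {"aliasing_rules": [["typing.List", "list"]]}

with open("/tmp/type_lattice.json", "w") as f:
    json.dump(lattice, f)
with open("/tmp/aliases.json", "w") as f:
    json.dump(aliases, f)
# These files could then be passed as type_lattice_path and built_in_aliases_path via
# RichPath.create("/tmp/type_lattice.json") and RichPath.create("/tmp/aliases.json").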
def run(args):
    azure_info_path = args.get('--azure-info', None)
    input_path = RichPath.create(args['INPUT_FILENAME'], azure_info_path)
    output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
    train = float(args['--train-ratio'])
    valid = float(args['--valid-ratio'])
    test = float(args['--test-ratio'])
    holdout = float(args['--holdout-ratio'])

    # get data and process it
    df = jsonl_to_df(input_path)
    print('Removing fuzzy duplicates ... this may take some time.')
    df = remove_duplicate_code_df(df)
    df = df.sample(frac=1, random_state=20181026)  # shuffle order of files
    df = label_folds(df, train_ratio=train, valid_ratio=valid,
                     test_ratio=test, holdout_ratio=holdout)
    splits = ['train', 'valid', 'test', 'holdout']

    for split in splits:
        split_df = df[df.partition == split]

        # save dataframes as chunked jsonl files
        jsonl_save_folder = output_folder.join(f'jsonl/{split}')
        print(f'Uploading data to {str(jsonl_save_folder)}')
        chunked_save_df_to_jsonl(split_df, jsonl_save_folder)

        # Upload dataframes to Azure
        filename = f'/tmp/{split}_df.pkl'
        df_save_path = output_folder.join(f'DataFrame/{split}_df.pkl')
        split_df.to_pickle(filename)
        print(f'Uploading data to {str(df_save_path)}')
        df_save_path.copy_from(RichPath.create(filename))
        os.unlink(filename)
def load_data(self, path: RichPath) -> None:
    # Note that as __load_data produces a generator, we explicitly force loading
    # (and caching) here:
    self._loaded_data[DataFold.TRAIN] = \
        list(self.__load_data(path.join("graphs-train"), DataFold.TRAIN))
    self._loaded_data[DataFold.VALIDATION] = \
        list(self.__load_data(path.join("graphs-valid"), DataFold.VALIDATION))
def test_copy_from(self):
    with self._setup_test() as az_info, TemporaryDirectory() as tmp_dir:
        elements = [[i, i // 2] for i in range(10000)]
        tmp_local_path = RichPath.create(tmp_dir).join("sample.json.gz")
        tmp_local_path.save_as_compressed_file(elements)

        remote_path1 = RichPath.create("azure://devstoreaccount1/test1/sample1.json.gz", az_info)
        self.assertFalse(remote_path1.exists())
        remote_path1.copy_from(tmp_local_path)
        tmp_local_path.delete()
        self.assertFalse(tmp_local_path.exists())
        self.assertTrue(remote_path1.exists())
        read_elements = remote_path1.read_by_file_suffix()
        self.assertListEqual(elements, read_elements)

        remote_path2 = RichPath.create("azure://devstoreaccount1/test1/sample2.json.gz", az_info)
        remote_path2.copy_from(remote_path1)
        remote_path1.delete()
        read_elements = remote_path2.read_by_file_suffix()
        self.assertListEqual(elements, read_elements)

        read_elements = remote_path2.to_local_path().read_by_file_suffix()
        self.assertListEqual(elements, read_elements)
        remote_path2.delete()
def get_staqc_dataset(path: RichPath) -> List[Dict[str, Any]]:
    codes = path.get_filtered_files_in_dir('python*qid_by*code.pickle')[0].read_as_pickle()
    titles = path.get_filtered_files_in_dir('python*qid_by*title.pickle')[0].read_as_pickle()
    data = chain([{'code': code,
                   'code_tokens': tokenize_python_from_string(code, func_only=False).code_tokens,
                   'docstring': titles[_id],
                   'docstring_tokens': tokenize_docstring_from_string(titles[_id]),
                   'language': 'python'}
                  for _id, code in codes.items()])
    filtered_data = filter_untokenizable_code(data)
    log_row_count_diff(original_data=codes.items(), filtered_data=filtered_data, label='StaQC')
    assert len(filtered_data) > 0, 'No code tokens retrieved after applying filters for StaQC.'
    return filtered_data
def save(self, path: RichPath) -> None:
    """Save the model at a given location."""
    with TemporaryDirectory() as tmpdir:
        target_file = os.path.join(tmpdir, 'model.pkl.gz')
        with gzip.open(target_file, 'wb') as f:
            torch.save(self, f)
        path.copy_from(RichPath.create(target_file))
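# A possible counterpart to the save method above (a sketch, not the project's own restore
# helper): pull the file into the local cache with to_local_path, then un-gzip it and hand
# the stream to torch.load, mirroring how it was written. The name load_model is assumed.
import gzip

import torch
from dpu_utils.utils import RichPath


def load_model(path: RichPath):
    local_file = path.to_local_path().path  # for remote paths this downloads to a local cache
    with gzip.open(local_file, "rb") as f:
        return torch.load(f)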
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str] = None,
              quiet: bool = False) -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name,
                        model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'], model.__class__.__name__,
                           json.dumps(hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'], model.__class__.__name__,
                           json.dumps(hyperparameters)))
        resume = False

    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths,
                             quiet=quiet, resume=resume)
    return model_path
def qm9_train_valid_paths(tmp_data_dir):
    """
    Our code dataset uses a format very similar to the QM9 dataset, so it can be processed
    the same way. Each sample consists of three parts:
      - the graph structure, given as (node, edge type, node) triples,
      - the node features, one-hot encoded,
      - the target value to predict.
    """
    train_valid_paths = [
        os.path.join(tmp_data_dir, f"{split}.jsonl.gz") for split in ["train", "valid"]
    ]

    data_samples = 5 * [{
        # Edge between vertices 0 and 1, with type 1, in (node, edge type, node) form.
        "graph": [(0, 1, 1)],
        # Two nodes, with one-hot encoded features of shape (2,).
        "node_features": [[1, 0], [0, 1]],
        # Target value for the graph: the energy value to predict, a 32-bit float.
        "targets": [[1.0]],
    }]
    for path in train_valid_paths:
        RichPath.create(path).save_as_compressed_file(data_samples)

    # The temporary files are created before the yield (fixture setup) and removed after it.
    yield train_valid_paths

    for path in train_valid_paths:
        os.remove(path)
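# A small, self-contained round trip in the same jsonl.gz format the fixture above writes:
# save_as_compressed_file stores one JSON object per line and read_by_file_suffix streams
# them back. The /tmp path is illustrative only.
from dpu_utils.utils import RichPath

sample_path = RichPath.create("/tmp/sample.jsonl.gz")
sample_path.save_as_compressed_file(
    [{"graph": [(0, 1, 1)], "node_features": [[1, 0], [0, 1]], "targets": [[1.0]]}])
print(list(sample_path.read_by_file_suffix()))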
def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    logger.info(f"Starting to load data from {path}.")

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        data_file = path.join("train.jsonl.gz")
        self.vocab_source = self._build_vocab(
            dataset=[datapoint["graph"]["node_features"]
                     for datapoint in data_file.read_by_file_suffix()],
            vocab_size=self.max_vocab_size,
        )
        self.vocab_target = self._build_vocab(
            dataset=[datapoint["Target"] for datapoint in data_file.read_by_file_suffix()],
            vocab_size=self.max_vocab_size,
        )
        self._loaded_data[DataFold.TRAIN] = self.__load_data(data_file)
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")
def __load_data(self, data_dir: RichPath,
                data_fold: DataFold) -> Tuple[List[GraphSample], Dict[str, Any]]:
    if data_fold is None:
        data_fold = DataFold.TRAIN

    if data_fold == DataFold.TRAIN:
        data_name = "train"
    elif data_fold == DataFold.VALIDATION:
        data_name = "valid"
    elif data_fold == DataFold.TEST:
        data_name = "test"
    else:
        raise ValueError("Unknown data fold '%s'" % str(data_fold))

    print(" Loading DEOBFUSCATION %s data from %s." % (data_name, data_dir))

    # Reuse a previously preprocessed version of the data if one was saved.
    if data_dir.join("%s-saved.pkl.gz" % data_name).is_file():
        read_data = data_dir.join("%s-saved.pkl.gz" % data_name).read_by_file_suffix()
        return read_data["all_graphs"], read_data["properties"]

    all_untensorised = data_dir.join("%s.pkl.gz" % data_name).read_by_file_suffix()
    graphs = all_untensorised["graphs"]

    properties = dict()
    properties["all_user_nodes"] = all_untensorised["name_to_id_mapping"]
    properties["user_defined_nodes_number"] = all_untensorised["total_user_defined_nodes"]
    properties["edge_mapping"] = all_untensorised["edge_name_to_id"]
    properties["__num_labels"] = len(properties["all_user_nodes"])
    properties["__num_edge_types"] = len(properties["edge_mapping"])
    properties["__num_types"] = len(all_untensorised["type_to_id"])

    all_graphs = []
    for i in tqdm(range(len(graphs))):
        old_graph = graphs[i]
        if old_graph["user_defined_nodes_number"] > 0:
            all_graphs.append(
                self.create_graph_sample(old_graph, properties["__num_edge_types"],
                                         len(properties["all_user_nodes"])))

    # Debug output for one sample graph.
    print_graph_number = 2500
    print([all_untensorised["ids_to_names"][x]
           for x in all_graphs[print_graph_number].labels],
          all_graphs[print_graph_number].nodes_mask)
    print(all_graphs[print_graph_number])

    to_save = dict()
    to_save["all_graphs"] = all_graphs
    to_save["properties"] = properties
    data_dir.join("%s-saved.pkl.gz" % data_name).save_as_compressed_file(to_save)
    print("Saved modified data to %s-saved.pkl.gz" % data_name)

    return all_graphs, properties
def jsonl_to_df(input_folder: RichPath, sample_percent: float, files_remaining: dict,
                azure_info_path) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """Concatenate a sample of the jsonl.gz files in the folder into a single pandas.DataFrame
    and return it together with a map from each sampled file to the projects it contributed."""

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    all_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    sample_size = math.ceil(sample_percent * len(all_files))
    if sample_size > len(files_remaining):
        sample_size = len(files_remaining)
    files = random.sample(list(files_remaining), sample_size)
    replaced = [0 for _ in range(sample_size)]

    # Keep re-sampling until fewer than two files had to be replaced in a pass.
    while True:
        for i in range(len(files)):
            f = files[i]
            other_files = {x for x in files if x != f}
            if f not in files_remaining or \
                    len(set.intersection(files_remaining[f], other_files)) == len(files) - 1:
                replaced[i] = 1
                f = random.sample(list(files_remaining), 1)
                while f[0] in files:
                    f = random.sample(list(files_remaining), 1)
                files[i] = f[0]
            else:
                replaced[i] = 0
        if sum(replaced) < 2:
            break

    for f in files:
        files_remaining[f] = files_remaining[f].union({x for x in files if x != f})
        if len(files_remaining[f]) == len(all_files) - 1:
            del files_remaining[f]

    with open('files_remaining.txt', 'w+') as f:
        files_remaining_converted = {}
        for path in files_remaining:
            files_remaining_converted[path] = list(files_remaining[path])
        print(json.dumps(files_remaining_converted), file=f)

    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')

    project_map = {x: [] for x in files}
    print(project_map)
    for f in tqdm(files, total=len(files)):
        rich_f = RichPath.create(f, azure_info_path)
        lines = list(rich_f.read_as_jsonl(
            error_handling=lambda m, e: print(f'Error while loading {m} : {e}')))
        lines_with_docstrings = []
        for line in lines:
            if len(line['docstring_tokens']) > 0:
                lines_with_docstrings.append(line)
                if line['nwo'] not in project_map[str(rich_f)]:
                    project_map[str(rich_f)].append(line['nwo'])
        dfs.append(pd.DataFrame(lines_with_docstrings))

    return pd.concat(dfs), project_map
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'], azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA'], azure_info_path)
    run_test(model_path, test_folder, type_lattice_path, alias_metadata_path,
             arguments['--print-predictions'])
def df_to_jsonl(df: pd.DataFrame, RichPath_obj: RichPath, i: int,
                basefilename='codedata') -> str:
    dest_filename = f'{basefilename}_{str(i).zfill(5)}.jsonl.gz'
    dest_path = RichPath_obj.join(dest_filename)
    dest_path.save_as_compressed_file(df.to_dict(orient='records'))
    return str(dest_path)
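# chunked_save_df_to_jsonl is called elsewhere in this section but its body is not shown;
# the sketch below is only a guess at its shape, layered on df_to_jsonl above, with
# chunk_size as an assumed parameter.
import pandas as pd
from dpu_utils.utils import RichPath


def chunked_save_df_to_jsonl(df: pd.DataFrame, output_folder: RichPath,
                             chunk_size: int = 10000, basefilename: str = 'codedata') -> None:
    # Write consecutive slices of the DataFrame as numbered {basefilename}_NNNNN.jsonl.gz chunks.
    for i, start in enumerate(range(0, len(df), chunk_size)):
        df_to_jsonl(df.iloc[start:start + chunk_size], output_folder, i, basefilename=basefilename)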
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    output_folder = arguments['OUTPUT_FOLDER']
    os.makedirs(output_folder, exist_ok=True)
    num_processes = int(arguments['--num-processes'])
    run_test(model_path, test_folder, output_folder, num_processes)
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    predictions_path = RichPath.create(arguments['PREDICTIONS_JSONL_GZ'], azure_info_path)
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'], azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA_PATH'], azure_info_path)
    compute(predictions_path, type_lattice_path, alias_metadata_path)
def test_simple_read_write(self):
    with self._setup_test() as az_info:
        remote_path = RichPath.create("azure://devstoreaccount1/test1/remote_path.txt", az_info)

        with TemporaryDirectory() as tmp_dir:
            data_f = os.path.join(tmp_dir, 'testdata.txt')
            with open(data_f, 'w') as f:
                f.write("hello!")

            local_path = RichPath.create(data_f)
            self.assertEqual(local_path.read_as_text(), "hello!")
            local_size = local_path.get_size()

            remote_path.copy_from(local_path)
            self.assertTrue(local_path.exists())
            local_path.delete()
            self.assertFalse(local_path.exists())
            local_path.delete()
            with self.assertRaises(Exception):
                local_path.delete(missing_ok=False)

            self.assertEqual(remote_path.read_as_text(), "hello!")
            # Read once again (should trigger cache)
            self.assertEqual(remote_path.read_as_text(), "hello!")

            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())
            self.assertFalse(remote_path.is_dir())
            self.assertEqual(local_size, remote_path.get_size())

            local_path = remote_path.to_local_path()
            self.assertTrue(local_path.exists())
            self.assertTrue(os.path.exists(local_path.path))
            with open(local_path.path, 'r') as f:
                self.assertEqual(f.read(), "hello!")

            # Delete file
            remote_path.delete()
            self.assertFalse(remote_path.exists())
            remote_path.delete()  # Should not raise Exception
            with self.assertRaises(FileNotFoundError):
                remote_path.delete(missing_ok=False)

            # Other random remote_path does not exist
            remote_path = RichPath.create("azure://devstoreaccount1/test1/remote_path2.txt", az_info)
            self.assertFalse(remote_path.exists())
            self.assertFalse(remote_path.is_dir())
            self.assertFalse(remote_path.is_file())
            with self.assertRaises(Exception):
                remote_path.read_as_text()
            with self.assertRaises(Exception):
                remote_path.get_size()
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
    "Concatenates all jsonl files from path and returns them as a single pandas.DataFrame."

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    for f in tqdm(files, total=len(files)):
        dfs.append(pd.DataFrame(list(f.read_as_jsonl(
            error_handling=lambda m, e: print(f'Error while loading {m} : {e}')))))
    return pd.concat(dfs)
def parse_jsonl_file(self, file: Path, output_folder: Path) -> None:
    print(f"Parsing data in {file}")
    input_file = RichPath.create(str(file))
    output_file = RichPath.create(str(output_folder / input_file.basename()))

    parsed_code = [
        self.process_data(self.extract_from_raw_data(raw_json_object))
        for raw_json_object in tqdm(input_file.read_by_file_suffix())
    ]

    print(f"Saving processed data in {output_file}")
    output_file.save_as_compressed_file(parsed_code)
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    type_lattice_file = RichPath.create(arguments['TYPE_LATTICE_FILE'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_BASE_FOLDER'], azure_info_path)

    input_folders, input_folder_basenames = [], set()
    print(arguments['INPUT_FOLDER'])
    print(arguments['TYPE_LATTICE_FILE'])
    print(arguments['OUTPUT_BASE_FOLDER'])
    for input_folder_name in arguments['INPUT_FOLDER']:
        input_folder_basename = os.path.basename(input_folder_name)
        if input_folder_basename in input_folder_basenames:
            raise ValueError("Several input folders with same basename '%s'!"
                             % (input_folder_basename,))
        input_folder_basenames.add(input_folder_basename)
        input_folder = RichPath.create(input_folder_name)
        assert input_folder.is_dir(), "%s is not a folder" % (input_folder,)
        input_folders.append(input_folder)

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))
    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    model = model_class(hyperparameters, run_name=arguments.get('--run-name'))

    metadata_to_use = arguments.get('--metadata-to-use', None)
    if metadata_to_use is None:
        train_folder = input_folders[0]
        model.load_metadata(train_folder, type_lattice_file,
                            max_num_files=int(arguments['--max-num-files']))
    else:
        metadata_path = RichPath.create(metadata_to_use, azure_info_path)
        model.load_existing_metadata(metadata_path)

    for_test = arguments.get('--for-test', False)
    model.make_model(is_train=not for_test)

    for input_folder in input_folders:
        input_folder_basename = input_folder.basename()
        this_output_folder = output_folder.join(input_folder_basename)
        this_output_folder.make_as_dir()

        model.tensorise_data_in_dir(input_folder, this_output_folder, for_test=for_test,
                                    max_num_files=int(arguments['--max-num-files']))
def test_connection_types(self):
    for auth_type in AuthType:
        with self.subTest(f"Test {auth_type}"), \
                self._setup_test(auth_type=auth_type) as az_info, \
                TemporaryDirectory() as tmp_dir:
            data_f = os.path.join(tmp_dir, 'testtext.txt')
            with open(data_f, 'w') as f:
                f.write("hello!")

            local_path = RichPath.create(data_f)
            remote_path = RichPath.create("azure://devstoreaccount1/test1/test_text.txt", az_info)
            remote_path.copy_from(local_path)
            local_path.delete()

            self.assertEqual(remote_path.read_as_text(), "hello!")
            remote_path.delete()
def start(train_data_path, valid_folder_path, save_folder_path, model_class):
    train_folder = RichPath.create(train_data_path)
    valid_folder = RichPath.create(valid_folder_path)
    os.makedirs(save_folder_path, exist_ok=True)

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder,)
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder,)

    model_class = get_model_class_from_name(model_class)
    hyperparameters = model_class.get_default_hyperparameters()
    hyperparameters['run_id'] = "%s-%s" % (model_class.__name__,
                                           time.strftime("%Y-%m-%d-%H-%M-%S"))

    return run_train(model_class, train_folder, valid_folder, save_folder_path, hyperparameters)
def save(self, path: RichPath) -> None:
    variables_to_save = list(set(self.__sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    weights_to_save = self.__sess.run(variables_to_save)
    weights_to_save = {var.name: value
                       for (var, value) in zip(variables_to_save, weights_to_save)}

    data_to_save = {
        "model_type": type(self).__name__,
        "hyperparameters": self.hyperparameters,
        "metadata": self.__metadata,
        "weights": weights_to_save,
        "run_name": self.__run_name,
    }

    path.save_as_compressed_file(data_to_save)
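# The dictionary written by the save method above can be read back with read_by_file_suffix,
# which unpickles .pkl.gz content; rebuilding the TF graph and assigning the stored weights
# is model-specific and not shown. The path below is illustrative only.
from dpu_utils.utils import RichPath

saved = RichPath.create("trained_models/model_best.pkl.gz").read_by_file_suffix()
print(saved["model_type"], saved["run_name"], len(saved["weights"]), "weight tensors")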
def test_code_data_load():
    dataset_params = CodeDataset.get_default_hyperparameters()
    dataset = CodeDataset(dataset_params)
    dataset.load_data(RichPath.create(CODE_DATA_PATH), folds_to_load={DataFold.TRAIN})

    for data_point in dataset._graph_iterator(DataFold.TRAIN):
        print(data_point.adjacency_lists)
def run_export(model_path: RichPath, test_data_path: RichPath, output_folder: str):
    test_hyper_overrides = {
        'run_id': 'exporting',
        'dropout_keep_rate': 1.0,
    }

    data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Restore model
    model = model_restore_helper.restore(model_path, is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    exporting = model.export_representations(data_chunks)

    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, 'vectors.tsv'), 'w') as vectors_file, \
            open(os.path.join(output_folder, 'metadata.tsv'), 'w') as metadata_file:
        metadata_file.write('varname\ttype\tkind\tprovenance\n')
        for annot in exporting:
            metadata_file.write(
                f'{assert_valid_str(annot.name)}\t{assert_valid_str(annot.type_annotation)}\t'
                f'{assert_valid_str(annot.kind)}\t{assert_valid_str(annot.provenance)}\n')
            vectors_file.write('\t'.join(str(e) for e in annot.representation))
            vectors_file.write('\n')
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)

    with ChunkWriter(output_folder, file_prefix='codedata', max_chunk_size=500,
                     file_suffix='.jsonl.gz') as chunked_writer:
        for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
            for line in file.read_by_file_suffix():
                tokens = line['code_tokens']
                chunked_writer.add(
                    dict(filename='%s:%s:%s' % (line['repo'], line['path'], line['lineno']),
                         tokens=tokens))
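# Reading back what the ChunkWriter above produced: the chunks are ordinary .jsonl.gz files,
# so they can be streamed again with RichPath. The output path below is illustrative only.
from dpu_utils.utils import RichPath

out_folder = RichPath.create("data/tokenized")
for chunk in out_folder.iterate_filtered_files_in_dir('codedata*.jsonl.gz'):
    for entry in chunk.read_by_file_suffix():
        print(entry['filename'], len(entry['tokens']))
    break  # inspect only the first chunk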
def jsonl_test_case():
    """Load the train.jsonl.gz and valid.jsonl.gz files under data/test_datasets/."""
    dataset_params = JsonLGraphPropertyDataset.get_default_hyperparameters()
    dataset_params["num_fwd_edge_types"] = 4
    dataset = JsonLGraphPropertyDataset(dataset_params)
    data_path = RichPath.create(os.path.join(os.path.dirname(__file__), "..", "test_datasets"))
    dataset.load_data(data_path, folds_to_load={DataFold.TRAIN, DataFold.VALIDATION})

    return TestCase(
        dataset=dataset,
        expected=TestExpectedValues(
            num_edge_types=dataset_params["num_fwd_edge_types"] + 1,
            node_feature_shape=(35,),
            num_train_samples=10,
            num_valid_samples=10,
            labels_key_name="target_value",
            add_self_loop_edges=dataset_params["add_self_loop_edges"],
            tie_fwd_bkwd_edges=dataset_params["tie_fwd_bkwd_edges"],
            self_loop_edge_type=0,
        ),
    )