def run(args):
    azure_info_path = args.get('--azure-info', None)
    input_path = RichPath.create(args['INPUT_FILENAME'], azure_info_path)
    output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
    train = float(args['--train-ratio'])
    valid = float(args['--valid-ratio'])
    test = float(args['--test-ratio'])
    holdout = float(args['--holdout-ratio'])

    # get data and process it
    df = jsonl_to_df(input_path)
    print('Removing fuzzy duplicates ... this may take some time.')
    df = remove_duplicate_code_df(df)
    df = df.sample(frac=1, random_state=20181026)  # shuffle order of files
    df = label_folds(df, train_ratio=train, valid_ratio=valid, test_ratio=test, holdout_ratio=holdout)

    splits = ['train', 'valid', 'test', 'holdout']
    for split in splits:
        split_df = df[df.partition == split]

        # save dataframes as chunked jsonl files
        jsonl_save_folder = output_folder.join(f'jsonl/{split}')
        print(f'Uploading data to {str(jsonl_save_folder)}')
        chunked_save_df_to_jsonl(split_df, jsonl_save_folder)

        # Upload dataframes to Azure
        filename = f'/tmp/{split}_df.pkl'
        df_save_path = output_folder.join(f'DataFrame/{split}_df.pkl')
        split_df.to_pickle(filename)
        print(f'Uploading data to {str(df_save_path)}')
        df_save_path.copy_from(RichPath.create(filename))
        os.unlink(filename)

def qm9_train_valid_paths(tmp_data_dir):
    """
    Our code dataset uses a format very similar to the QM9 dataset, so it can be processed the
    same way. Each sample contains three parts:
    - the graph, represented as (node, edge type, node) triples,
    - the node features, one-hot encoded,
    - the target (prediction) value.
    """
    train_valid_paths = [
        os.path.join(tmp_data_dir, f"{split}.jsonl.gz") for split in ["train", "valid"]
    ]

    data_samples = 5 * [{
        "graph": [(0, 1, 1)],  # Edge between vertices 0 and 1, with type 1, in (node, edge type, node) form.
        "node_features": [[1, 0], [0, 1]],  # Two nodes, with one-hot encoded features of shape (2,).
        "targets": [[1.0]],  # Target value for the graph: the energy value to predict, a 32-bit float.
    }]

    for path in train_valid_paths:
        RichPath.create(path).save_as_compressed_file(data_samples)

    # The temporary files are created on the first call (before the yield)
    # and deleted on the second (after it).
    yield train_valid_paths

    for path in train_valid_paths:
        os.remove(path)

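# A minimal round-trip sketch for the fixture above (the file path below is illustrative, not
# from the original code): save_as_compressed_file writes the records to a .jsonl.gz file, and
# read_by_file_suffix yields them back as dictionaries.
from dpu_utils.utils import RichPath

sample_path = RichPath.create("/tmp/sample.jsonl.gz")
sample_path.save_as_compressed_file([
    {"graph": [(0, 1, 1)], "node_features": [[1, 0], [0, 1]], "targets": [[1.0]]}
])
for sample in sample_path.read_by_file_suffix():
    print(sample["graph"], sample["targets"])
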
def run(arguments, tag_in_vcs=False) -> RichPath:
    azure_info_path = arguments.get('--azure-info', None)
    train_folder = RichPath.create(arguments['TRAIN_DATA_PATH'], azure_info_path)
    valid_folder = RichPath.create(arguments['VALID_DATA_PATH'], azure_info_path)
    save_folder = arguments['SAVE_FOLDER']

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder,)
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder,)

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))
    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    hyperparameters['run_id'] = make_run_id(arguments)

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        hyperparameters['git_commit'] = git_tag_run(hyperparameters['run_id'])

    return run_train(model_class, train_folder, valid_folder, save_folder, hyperparameters,
                     arguments.get('--run-name'), arguments.get('--quiet', False))

def test_copy_from(self):
    with self._setup_test() as az_info, TemporaryDirectory() as tmp_dir:
        elements = [[i, i // 2] for i in range(10000)]
        tmp_local_path = RichPath.create(tmp_dir).join("sample.json.gz")
        tmp_local_path.save_as_compressed_file(elements)

        remote_path1 = RichPath.create("azure://devstoreaccount1/test1/sample1.json.gz", az_info)
        self.assertFalse(remote_path1.exists())
        remote_path1.copy_from(tmp_local_path)
        tmp_local_path.delete()
        self.assertFalse(tmp_local_path.exists())
        self.assertTrue(remote_path1.exists())

        read_elements = remote_path1.read_by_file_suffix()
        self.assertListEqual(elements, read_elements)

        remote_path2 = RichPath.create("azure://devstoreaccount1/test1/sample2.json.gz", az_info)
        remote_path2.copy_from(remote_path1)
        remote_path1.delete()

        read_elements = remote_path2.read_by_file_suffix()
        self.assertListEqual(elements, read_elements)

        read_elements = remote_path2.to_local_path().read_by_file_suffix()
        self.assertListEqual(elements, read_elements)

        remote_path2.delete()

def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    output_folder = arguments['OUTPUT_FOLDER']
    os.makedirs(output_folder, exist_ok=True)
    num_processes = int(arguments['--num-processes'])
    run_test(model_path, test_folder, output_folder, num_processes)

def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'], azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA'], azure_info_path)
    run_test(model_path, test_folder, type_lattice_path, alias_metadata_path,
             arguments['--print-predictions'])

def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    predictions_path = RichPath.create(arguments['PREDICTIONS_JSONL_GZ'], azure_info_path)
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'], azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA_PATH'], azure_info_path)
    compute(predictions_path, type_lattice_path, alias_metadata_path)

def test_simple_read_write(self):
    with self._setup_test() as az_info:
        remote_path = RichPath.create("azure://devstoreaccount1/test1/remote_path.txt", az_info)
        with TemporaryDirectory() as tmp_dir:
            data_f = os.path.join(tmp_dir, 'testdata.txt')
            with open(data_f, 'w') as f:
                f.write("hello!")

            local_path = RichPath.create(data_f)
            self.assertEqual(local_path.read_as_text(), "hello!")
            local_size = local_path.get_size()

            remote_path.copy_from(local_path)
            self.assertTrue(local_path.exists())
            local_path.delete()
            self.assertFalse(local_path.exists())
            local_path.delete()
            with self.assertRaises(Exception):
                local_path.delete(missing_ok=False)

            self.assertEqual(remote_path.read_as_text(), "hello!")
            # Read once again (should trigger cache)
            self.assertEqual(remote_path.read_as_text(), "hello!")
            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())
            self.assertFalse(remote_path.is_dir())
            self.assertEqual(local_size, remote_path.get_size())

            local_path = remote_path.to_local_path()
            self.assertTrue(local_path.exists())
            os.path.exists(local_path.path)
            with open(local_path.path, 'r') as f:
                self.assertEqual(f.read(), "hello!")

            # Delete file
            remote_path.delete()
            self.assertFalse(remote_path.exists())
            remote_path.delete()  # Should not raise Exception
            with self.assertRaises(FileNotFoundError):
                remote_path.delete(missing_ok=False)

            # Other random remote_path does not exist
            remote_path = RichPath.create("azure://devstoreaccount1/test1/remote_path2.txt", az_info)
            self.assertFalse(remote_path.exists())
            self.assertFalse(remote_path.is_dir())
            self.assertFalse(remote_path.is_file())
            with self.assertRaises(Exception):
                remote_path.read_as_text()
            with self.assertRaises(Exception):
                remote_path.get_size()

def parse_jsonl_file(self, file: Path, output_folder: Path) -> None:
    print(f"Parsing data in {file}")
    input_file = RichPath.create(str(file))
    output_file = RichPath.create(str(output_folder / input_file.basename()))

    parsed_code = [
        self.process_data(self.extract_from_raw_data(raw_json_object))
        for raw_json_object in tqdm(input_file.read_by_file_suffix())
    ]

    print(f"Saving processed data in {output_file}")
    output_file.save_as_compressed_file(parsed_code)

def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    type_lattice_file = RichPath.create(arguments['TYPE_LATTICE_FILE'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_BASE_FOLDER'], azure_info_path)

    input_folders, input_folder_basenames = [], set()
    print(arguments['INPUT_FOLDER'])
    print(arguments['TYPE_LATTICE_FILE'])
    print(arguments['OUTPUT_BASE_FOLDER'])
    for input_folder_name in arguments['INPUT_FOLDER']:
        input_folder_basename = os.path.basename(input_folder_name)
        if input_folder_basename in input_folder_basenames:
            raise ValueError("Several input folders with same basename '%s'!" % (input_folder_basename,))
        input_folder_basenames.add(input_folder_basename)
        input_folder = RichPath.create(input_folder_name)
        assert input_folder.is_dir(), "%s is not a folder" % (input_folder,)
        input_folders.append(input_folder)

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))
    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    model = model_class(hyperparameters, run_name=arguments.get('--run-name'))

    metadata_to_use = arguments.get('--metadata-to-use', None)
    if metadata_to_use is None:
        train_folder = input_folders[0]
        model.load_metadata(train_folder, type_lattice_file,
                            max_num_files=int(arguments['--max-num-files']))
    else:
        metadata_path = RichPath.create(metadata_to_use, azure_info_path)
        model.load_existing_metadata(metadata_path)

    for_test = arguments.get('--for-test', False)
    model.make_model(is_train=not for_test)

    for input_folder in input_folders:
        input_folder_basename = input_folder.basename()
        this_output_folder = output_folder.join(input_folder_basename)
        this_output_folder.make_as_dir()
        model.tensorise_data_in_dir(input_folder, this_output_folder, for_test=for_test,
                                    max_num_files=int(arguments['--max-num-files']))

def start(train_data_path, valid_folder_path, save_folder_path, model_class):
    train_folder = RichPath.create(train_data_path)
    valid_folder = RichPath.create(valid_folder_path)
    os.makedirs(save_folder_path, exist_ok=True)

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder,)
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder,)

    model_class = get_model_class_from_name(model_class)
    hyperparameters = model_class.get_default_hyperparameters()
    hyperparameters['run_id'] = "%s-%s" % (model_class.__name__,
                                           time.strftime("%Y-%m-%d-%H-%M-%S"))

    return run_train(model_class, train_folder, valid_folder, save_folder_path, hyperparameters)

def test_connection_types(self):
    for auth_type in AuthType:
        with self.subTest(f"Test {auth_type}"), self._setup_test(auth_type=auth_type) as az_info, \
                TemporaryDirectory() as tmp_dir:
            data_f = os.path.join(tmp_dir, 'testtext.txt')
            with open(data_f, 'w') as f:
                f.write("hello!")

            local_path = RichPath.create(data_f)
            remote_path = RichPath.create("azure://devstoreaccount1/test1/test_text.txt", az_info)
            remote_path.copy_from(local_path)
            local_path.delete()

            self.assertEqual(remote_path.read_as_text(), "hello!")
            remote_path.delete()

def run(arguments) -> None:
    print("Loading data ...")
    dataset_params = JsonLMethod2CommentDataset.get_default_hyperparameters()
    dataset = JsonLMethod2CommentDataset(dataset_params)
    data_description = dataset.get_batch_tf_data_description()

    model = LanguageModel.restore(arguments["TRAINED_MODEL"], data_description.batch_features_shapes)
    print(f" Loaded trained model from {arguments['TRAINED_MODEL']}.")

    dataset.load_vocab(model.vocab_source, model.vocab_target)
    data_path = RichPath.create(
        os.path.join(os.path.dirname(__file__), ".", "datasets/" + arguments['TEST_DATA_DIR'])
    )
    dataset.load_data(data_path, folds_to_load=[DataFold.VALIDATION])
    test_data = dataset.get_tensorflow_dataset(DataFold.VALIDATION)
    print(f" Loaded {len(list(test_data))} testing samples.")

    test_loss, test_acc, test_true, test_pred = model.run_one_epoch(test_data, training=False)
    test_bleu, test_nist, test_dist, test_rouge2, test_rougel = calculate_metrics(test_true, test_pred)
    print(f"Test: Loss {test_loss:.4f}, Acc {test_acc:.3f}, BLEU {test_bleu:.3f}")
    print(f" NIST {test_nist:.3f}, DIST {test_dist:.3f}, ROUGE-2 {test_rouge2:.3f}, ROUGE-L {test_rougel:.3f}")

def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
    """Load the data from disk."""
    if path is None:
        path = RichPath.create(self.default_data_directory())
    logger.info("Starting to load data.")

    # If we haven't defined what folds to load, load all:
    if folds_to_load is None:
        folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

    if DataFold.TRAIN in folds_to_load:
        self._loaded_data[DataFold.TRAIN] = self.__load_data(path.join("train.jsonl.gz"))
        logger.debug("Done loading training data.")
    if DataFold.VALIDATION in folds_to_load:
        self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
        logger.debug("Done loading validation data.")
    if DataFold.TEST in folds_to_load:
        self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
        logger.debug("Done loading test data.")

def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)

    with ChunkWriter(output_folder, file_prefix='codedata', max_chunk_size=500,
                     file_suffix='.jsonl.gz') as chunked_writer:
        for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
            for line in file.read_by_file_suffix():
                tokens = line['code_tokens']
                chunked_writer.add(
                    dict(filename='%s:%s:%s' % (line['repo'], line['path'], line['lineno']),
                         tokens=tokens))

def train(self, train_data: List[RichPath], valid_data: List[RichPath],
          quiet: bool = False, resume: bool = False) -> RichPath:
    model_path = RichPath.create(self.model_save_path)
    with self.__sess.as_default():
        tf.set_random_seed(self.hyperparameters['seed'])

        if resume:
            # Variables should have been restored.
            best_val_loss = self.__run_epoch_in_batches(valid_data, "RESUME (valid)",
                                                        is_train=False, quiet=quiet)
            self.train_log('Validation Loss on Resume: %.6f' % (best_val_loss,))
        else:
            init_op = tf.variables_initializer(
                self.__sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
            self.__sess.run(init_op)
            self.save(model_path)
            best_val_loss = float("inf")

        no_improvement_counter = 0
        epoch_number = 0
        while (epoch_number < self.hyperparameters['max_epochs']
               and no_improvement_counter < self.hyperparameters['patience']):
            self.train_log('==== Epoch %i ====' % (epoch_number,))
            train_loss = self.__run_epoch_in_batches(train_data, "%i (train)" % (epoch_number,),
                                                     is_train=True, quiet=quiet)
            self.train_log(' Training Loss: %.6f' % (train_loss,))
            val_loss = self.__run_epoch_in_batches(valid_data, "%i (valid)" % (epoch_number,),
                                                   is_train=False, quiet=quiet)
            self.train_log(' Validation Loss: %.6f' % (val_loss,))

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                no_improvement_counter = 0
                self.save(model_path)
                self.train_log(" Best result so far -- saving model as '%s'." % (model_path,))
            else:
                no_improvement_counter += 1
            epoch_number += 1

    return model_path

def test(model_path: str, test_data_path: Optional[RichPath], result_dir: str,
         quiet: bool = False, run_id: str = None):
    model = restore(model_path, result_dir, run_id)
    # We can process larger batches if we don't do training:
    model.params['max_nodes_in_batch'] = 2 * model.params['max_nodes_in_batch']
    test_data_path = test_data_path or RichPath.create(model.task.default_data_path())
    model.log_line(" Using the following task params: %s" % json.dumps(model.task.params))
    model.log_line(" Using the following model params: %s" % json.dumps(model.params))
    model.test(test_data_path)

def jsonl_test_case():
    """
    Load the train.jsonl.gz and valid.jsonl.gz files from the ../test_datasets directory.
    """
    dataset_params = JsonLGraphPropertyDataset.get_default_hyperparameters()
    dataset_params["num_fwd_edge_types"] = 4
    dataset = JsonLGraphPropertyDataset(dataset_params)
    data_path = RichPath.create(os.path.join(os.path.dirname(__file__), "..", "test_datasets"))
    dataset.load_data(data_path, folds_to_load={DataFold.TRAIN, DataFold.VALIDATION})

    return TestCase(
        dataset=dataset,
        expected=TestExpectedValues(
            num_edge_types=dataset_params["num_fwd_edge_types"] + 1,
            node_feature_shape=(35,),
            num_train_samples=10,
            num_valid_samples=10,
            labels_key_name="target_value",
            add_self_loop_edges=dataset_params["add_self_loop_edges"],
            tie_fwd_bkwd_edges=dataset_params["tie_fwd_bkwd_edges"],
            self_loop_edge_type=0,
        ),
    )

def test_train_code_model():
    data_path = RichPath.create(CODE_FILE_PATH)
    # Build the model and dataset from the given configuration.
    dataset, model = get_model_and_dataset(
        msg_passing_implementation='RGCN',
        data_path=data_path,
        task_name='Code',
        cli_data_hyperparameter_overrides=None,
        cli_model_hyperparameter_overrides=None,
        hyperdrive_hyperparameter_overrides={},
        folds_to_load={DataFold.TRAIN},
        load_weights_only=False,
        disable_tf_function_build=False,
        trained_model_file=None)

    def log(msg):
        log_line(LOG_FILE_PATH, msg)

    trained_model_path = train(model,
                               dataset,
                               log_fun=log,
                               run_id='1',
                               max_epochs=10,
                               patience=1,
                               save_dir=SAVE_MODEL_PATH,
                               quiet=False,
                               aml_run=None)
    print(trained_model_path)

def test_train_model(self):
    num_features = 100
    training_data, validation_data = self.__get_data(num_features)
    with tempfile.TemporaryDirectory() as dir:
        model_file = RichPath.create(dir).join('tmp.pkl.gz')
        model = SimpleRegression('SimpleRegressionTest', num_features)
        trainer = ComponentTrainer(model, model_file, max_num_epochs=50)
        trainer.train(training_data, validation_data, parallel_minibatch_creation=True)

        model_acc_1 = self.__compute_accuracy(model, validation_data)

        trained_model = SimpleRegression.restore_model(model_file)  # type: SimpleRegression
        trained_model_acc = self.__compute_accuracy(trained_model, validation_data)
        self.assertGreater(
            trained_model_acc, .95,
            f'Model achieves too low accuracy, {trained_model_acc:%}')
        self.assertAlmostEqual(
            trained_model_acc, model_acc_1, places=3,
            msg=f'Accuracy before and after loading does not match: {trained_model_acc} vs {model_acc_1}')

def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    valid_data_dir = test.expand_data_path(arguments['VALID_DATA_PATH'], azure_info_path)
    test_data_dir = test.expand_data_path(arguments['TEST_DATA_PATH'], azure_info_path)
    model_paths = RichPath.create(
        arguments['MODEL_PATH'], azure_info_path=azure_info_path).get_filtered_files_in_dir('*.pkl.gz')
    alpha = float(arguments['--alpha'])

    with Pool(int(arguments['--processes'])) as pool:
        results = pool.map(
            functools.partial(test.compute_evaluation_metrics,
                              arguments=arguments,
                              azure_info_path=azure_info_path,
                              valid_data_dirs=valid_data_dir,
                              test_data_dirs=test_data_dir,
                              return_results=True,
                              languages=['java'],
                              test_valid=False),
            model_paths)

    docstring_mrrs = [x['java'][0] for x in results]
    func_name_mrrs = [x['java'][1] for x in results]
    docstring_confidence = get_confidence_interval(docstring_mrrs, alpha)
    func_name_confidence = get_confidence_interval(func_name_mrrs, alpha)
    print(f'{alpha*100}% confidence interval for mrr using docstring as the query: {docstring_confidence}')
    print(f'{alpha*100}% confidence interval for mrr using function name as the query: {func_name_confidence}')

def test_freeze_params(self):
    num_features = 100
    training_data, validation_data = self.__get_data(num_features)
    with tempfile.TemporaryDirectory() as dir:
        model_file = RichPath.create(dir).join('tmp.pkl.gz')
        model = SimpleRegression('SimpleRegressionTest', num_features)
        trainer = ComponentTrainer(model, model_file, max_num_epochs=50)

        def get_freeze_weights():
            for p in model.parameters():
                if len(p.shape) == 2:  # Just the weights
                    yield p

        trainer.train(training_data,
                      validation_data,
                      get_parameters_to_freeze=lambda: set(get_freeze_weights()))
        trained_model_acc = self.__compute_accuracy(model, validation_data)
        self.assertLess(
            trained_model_acc, .7,
            f'Model achieves too high accuracy but the weights were frozen, {trained_model_acc:%}')

def test_code_data_load():
    dataset_params = CodeDataset.get_default_hyperparameters()
    dataset = CodeDataset(dataset_params)
    dataset.load_data(RichPath.create(CODE_DATA_PATH), folds_to_load={DataFold.TRAIN})
    for data_point in dataset._graph_iterator(DataFold.TRAIN):
        print(data_point.adjacency_lists)

def __init__(self,
             out_folder: Union[RichPath, str],
             file_prefix: str,
             max_chunk_size: int,
             file_suffix: str,
             parallel_writers: int = 0,
             mode: str = 'w'):
    self.__current_chunk = []  # type: List[T]
    if isinstance(out_folder, str):
        self.__out_folder = RichPath.create(out_folder)  # type: RichPath
    else:
        self.__out_folder = out_folder
    self.__out_folder.make_as_dir()
    self.__file_prefix = file_prefix
    self.__max_chunk_size = max_chunk_size
    self.__file_suffix = file_suffix

    self.__mode = mode.lower()
    assert self.__mode in ('a', 'w'), \
        'Mode must be either append (a) or write (w). Given: {0}'.format(mode)
    if self.__mode == 'w':
        self.__num_files_written = 0  # 'w' mode will begin writing from scratch
    else:
        self.__num_files_written = self.__get_max_existing_index() + 1  # 'a' mode starts after the last-written file

    self.__parallel_writers = parallel_writers
    if self.__parallel_writers > 0:
        self.__writer_executors = ThreadPoolExecutor(max_workers=self.__parallel_writers)

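# A small usage sketch for the chunk writer configured above, mirroring the earlier `run`
# snippet that tokenizes code files. It assumes this is the ChunkWriter exposed by
# dpu_utils.utils (an assumption); records passed to add() are buffered and flushed to
# consecutively numbered files built from file_prefix and file_suffix once max_chunk_size
# records have accumulated.
from dpu_utils.utils import ChunkWriter, RichPath

out_dir = RichPath.create("/tmp/chunks")
with ChunkWriter(out_dir, file_prefix='codedata', max_chunk_size=500,
                 file_suffix='.jsonl.gz') as writer:
    for i in range(2000):
        writer.add({"id": i, "tokens": ["def", "f", "(", ")", ":"]})
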
def save(self, path: RichPath) -> None:
    """Save the model at a given location."""
    with TemporaryDirectory() as tmpdir:
        target_file = os.path.join(tmpdir, 'model.pkl.gz')
        with gzip.open(target_file, 'wb') as f:
            torch.save(self, f)
        path.copy_from(RichPath.create(target_file))

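# A hypothetical counterpart to `save` above (not part of the original code): a sketch that
# restores the model by materialising a local copy of the gzip'ed pickle and handing the file
# object to torch.load, mirroring how torch.save was called on a gzip file handle.
import gzip

import torch
from dpu_utils.utils import RichPath


def load(path: RichPath):
    # to_local_path() is a no-op for local paths and downloads remote ones to a cached copy.
    local_file = path.to_local_path().path
    with gzip.open(local_file, 'rb') as f:
        return torch.load(f)
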
def run_train(model_class: Type[Model],
              train_data_dirs: List[RichPath],
              valid_data_dirs: List[RichPath],
              save_folder: str,
              hyperparameters: Dict[str, Any],
              azure_info_path: Optional[str],
              run_name: str,
              quiet: bool = False,
              max_files_per_dir: Optional[int] = None,
              parallelize: bool = True) -> RichPath:
    assert parallelize
    model = model_class(hyperparameters, run_name=run_name,
                        model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = True
    else:
        model.train_log("Tokenizing and building vocabulary for code snippets and queries. "
                        "This step may take several hours.")
        model.load_metadata(train_data_dirs, max_files_per_dir=max_files_per_dir, parallelize=parallelize)
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = False

    philly_job_id = os.environ.get('PHILLY_JOB_ID')
    if philly_job_id is not None:
        # We are in Philly; write out the model name in an auxiliary file.
        with open(os.path.join(save_folder, philly_job_id + '.job'), 'w') as f:
            f.write(os.path.basename(model.model_save_path))

    wandb.config.update(model.hyperparameters)

    model.train_log("Loading training and validation data.")
    train_data = model.load_data_from_dirs(train_data_dirs, is_test=False,
                                           max_files_per_dir=max_files_per_dir, parallelize=parallelize)
    valid_data = model.load_data_from_dirs(valid_data_dirs, is_test=False,
                                           max_files_per_dir=max_files_per_dir, parallelize=parallelize)
    model.train_log("Begin Training.")
    model_path = model.train(train_data, valid_data, azure_info_path, quiet=quiet, resume=resume)
    return model_path

def load_model(self):
    model_path = RichPath.create(self.local_model_path, None)
    print("Restoring model from %s" % model_path)
    self.model = model_restore_helper.restore(path=model_path, is_train=False, hyper_overrides={})

    for language in ['python', 'go', 'javascript', 'java', 'php', 'ruby']:
        # for language in ['python']:
        print("Loading language: %s" % language)
        self.definitions[language] = pickle.load(
            open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb'))

        if os.path.exists('/datadrive/{}.ann'.format(language)):
            self.indices[language] = AnnoyIndex(128, 'angular')
            self.indices[language].load('/datadrive/{}.ann'.format(language))
        else:
            indexes = [{
                'code_tokens': d['function_tokens'],
                'language': d['language']
            } for d in tqdm(self.definitions[language])]
            code_representations = self.model.get_code_representations(indexes)
            print(code_representations[0].shape)

            self.indices[language] = AnnoyIndex(code_representations[0].shape[0], 'angular')
            for index, vector in tqdm(enumerate(code_representations)):
                assert vector is not None
                self.indices[language].add_item(index, vector)
            self.indices[language].build(1000)
            self.indices[language].save('/datadrive/{}.ann'.format(language))

def test_cache_correctness(self):
    with self._setup_test() as az_info:
        for suffix in ('.jsonl.gz', '.msgpack.l.gz'):
            random_elements = list(range(100))
            remote_path = RichPath.create("azure://devstoreaccount1/test1/compressed/data" + suffix, az_info)
            remote_path.save_as_compressed_file(random_elements)

            # Read once
            read_nums = list(remote_path.read_by_file_suffix())
            self.assertListEqual(read_nums, random_elements)

            # Hit Cache
            read_nums = list(remote_path.read_by_file_suffix())
            self.assertListEqual(read_nums, random_elements)

            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())

            # Update file through other means, and ensure that cache is appropriately invalidated.
            new_elements = list(range(500))
            with TemporaryDirectory() as tmp:
                path = os.path.join(tmp, 'tst' + suffix)
                if suffix == '.jsonl.gz':
                    save_jsonl_gz(new_elements, path)
                else:
                    save_msgpack_l_gz(new_elements, path)

                container_client = ContainerClient.from_connection_string(
                    self.AZURITE_DEVELOPMENT_CONNECTION_STRING, "test1")
                blob_client = container_client.get_blob_client("compressed/data" + suffix)
                with open(path, 'rb') as f:
                    blob_client.upload_blob(f, overwrite=True)

            read_nums = list(remote_path.read_by_file_suffix())
            self.assertListEqual(read_nums, new_elements)
            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())

def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str] = None,
              quiet: bool = False) -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name,
                        model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'], model.__class__.__name__, json.dumps(hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s"
                        % (hyperparameters['run_id'], model.__class__.__name__, json.dumps(hyperparameters)))
        resume = False

    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths, quiet=quiet, resume=resume)
    return model_path

def expand_data_path(data_path: str, azure_info_path: Optional[str]) -> List[RichPath]:
    """
    Args:
        data_path: A path to either a directory or a file. If it is a file, each of its lines
            is interpreted as the path of a data directory.
        azure_info_path: Optional path to the Azure authentication information used by RichPath.

    Returns:
        List of data directories (potentially just data_path itself).
    """
    data_rpath = RichPath.create(data_path, azure_info_path)
    if data_rpath.is_dir():
        return [data_rpath]

    return [RichPath.create(data_dir, azure_info_path)
            for data_dir in data_rpath.read_as_text().splitlines()]

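# A brief usage sketch for expand_data_path (the paths below are illustrative, not from the
# original code): a directory argument comes back as a single-element list, while a plain-text
# file whose lines are directory paths expands to one RichPath per listed directory.
train_dirs = expand_data_path('/data/train', azure_info_path=None)         # -> [RichPath for /data/train]
eval_dirs = expand_data_path('/data/eval_dirs.txt', azure_info_path=None)  # -> one RichPath per line in the file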