Example #1
0
def run(args):

    azure_info_path = args.get('--azure-info', None)
    input_path = RichPath.create(args['INPUT_FILENAME'], azure_info_path)
    output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
    train = float(args['--train-ratio'])
    valid = float(args['--valid-ratio'])
    test = float(args['--test-ratio'])
    holdout = float(args['--holdout-ratio'])

    # get data and process it
    df = jsonl_to_df(input_path)
    print('Removing fuzzy duplicates ... this may take some time.')
    df = remove_duplicate_code_df(df)
    df = df.sample(frac=1, random_state=20181026)  # shuffle order of files
    df = label_folds(df, train_ratio=train, valid_ratio=valid, test_ratio=test, holdout_ratio=holdout)
    splits = ['train', 'valid', 'test', 'holdout']

    for split in splits:
        split_df = df[df.partition == split]

        # save dataframes as chunked jsonl files
        jsonl_save_folder = output_folder.join(f'jsonl/{split}')
        print(f'Uploading data to {str(jsonl_save_folder)}')
        chunked_save_df_to_jsonl(split_df, jsonl_save_folder)

        # Upload dataframes to Azure
        filename = f'/tmp/{split}_df.pkl'
        df_save_path = output_folder.join(f'DataFrame/{split}_df.pkl')
        split_df.to_pickle(filename)
        print(f'Uploading data to {str(df_save_path)}')
        df_save_path.copy_from(RichPath.create(filename))
        os.unlink(filename)
Example #2
0
def qm9_train_valid_paths(tmp_data_dir):
    """
    我们代码数据集的格式和qm9数据集的格式比较相似,可以按照qm9的格式进行处理
    主要包含三部分的数据
    图表示形式:点边点的三元组
    节点的特征,使用one-hot编码
    目标值(预测值)
    """
    train_valid_paths = [
        os.path.join(tmp_data_dir, f"{split}.jsonl.gz")
        for split in ["train", "valid"]
    ]

    data_samples = 5 * [{
        # Edge between vertices 0 and 1, with type 1, written as a node-edge-node triple.
        "graph": [(0, 1, 1)],
        # Two nodes, with one-hot encoded node features of shape (2,).
        "node_features": [[1, 0], [0, 1]],
        # Target value for the graph: the energy value to predict, a single 32-bit float.
        "targets": [[1.0]],
    }]

    for path in train_valid_paths:
        RichPath.create(path).save_as_compressed_file(data_samples)

    # The temporary files are created before the yield and removed again on fixture teardown.
    yield train_valid_paths

    for path in train_valid_paths:
        os.remove(path)
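The fixture above only writes the two compressed folds. A minimal sketch of reading one of them back, assuming the same dpu_utils RichPath API used throughout these examples (the path below is a placeholder):

from dpu_utils.utils import RichPath

def load_fold_samples(fold_path: str):
    # For a ".jsonl.gz" path, read_by_file_suffix() picks gzip + json-lines decoding
    # and yields one dict per stored sample (graph, node_features, targets).
    return list(RichPath.create(fold_path).read_by_file_suffix())

# e.g. samples = load_fold_samples("/tmp/qm9_data/train.jsonl.gz")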
def run(arguments, tag_in_vcs=False) -> RichPath:
    azure_info_path = arguments.get('--azure-info', None)
    train_folder = RichPath.create(arguments['TRAIN_DATA_PATH'],
                                   azure_info_path)
    valid_folder = RichPath.create(arguments['VALID_DATA_PATH'],
                                   azure_info_path)
    save_folder = arguments['SAVE_FOLDER']

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder, )
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder, )

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))

    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    hyperparameters['run_id'] = make_run_id(arguments)

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        hyperparameters['git_commit'] = git_tag_run(hyperparameters['run_id'])

    return run_train(model_class, train_folder, valid_folder, save_folder,
                     hyperparameters, arguments.get('--run-name'),
                     arguments.get('--quiet', False))
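The run() entry points in these examples consume a docopt-style argument mapping. A hedged sketch of what that mapping might look like for the trainer above; every path and value here is a placeholder, mirroring only the keys that run() reads:

# Hypothetical argument dict; the keys mirror the lookups in run() above.
arguments = {
    'TRAIN_DATA_PATH': 'data/train',
    'VALID_DATA_PATH': 'data/valid',
    'SAVE_FOLDER': 'trained_models',
    '--azure-info': None,        # local paths, so no Azure credentials file
    '--model': 'nag',            # default model name used by run()
    '--hypers-override': None,
    '--run-name': None,
    '--quiet': False,
}
# model_save_path = run(arguments)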
Example #4
0
    def test_copy_from(self):
        with self._setup_test() as az_info, TemporaryDirectory() as tmp_dir:
            elements = [[i, i//2] for i in range(10000)]
            tmp_local_path = RichPath.create(tmp_dir).join("sample.json.gz")
            tmp_local_path.save_as_compressed_file(elements)

            remote_path1 = RichPath.create(f"azure://devstoreaccount1/test1/sample1.json.gz", az_info)
            self.assertFalse(remote_path1.exists())

            remote_path1.copy_from(tmp_local_path)
            tmp_local_path.delete()

            self.assertFalse(tmp_local_path.exists())
            self.assertTrue(remote_path1.exists())

            read_elements = remote_path1.read_by_file_suffix()
            self.assertListEqual(elements, read_elements)

            remote_path2 = RichPath.create(f"azure://devstoreaccount1/test1/sample2.json.gz", az_info)
            remote_path2.copy_from(remote_path1)
            remote_path1.delete()

            read_elements = remote_path2.read_by_file_suffix()
            self.assertListEqual(elements, read_elements)

            read_elements = remote_path2.to_local_path().read_by_file_suffix()
            self.assertListEqual(elements, read_elements)
            remote_path2.delete()
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    output_folder = arguments['OUTPUT_FOLDER']
    os.makedirs(output_folder, exist_ok=True)
    num_processes = int(arguments['--num-processes'])
    run_test(model_path, test_folder, output_folder, num_processes)
Example #6
0
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'], azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA'], azure_info_path)
    run_test(model_path, test_folder, type_lattice_path, alias_metadata_path,
             arguments['--print-predictions'])
Example #7
0
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    predictions_path = RichPath.create(arguments['PREDICTIONS_JSONL_GZ'],
                                       azure_info_path)
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'],
                                        azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA_PATH'],
                                          azure_info_path)
    compute(predictions_path, type_lattice_path, alias_metadata_path)
Example #8
0
    def test_simple_read_write(self):
        with self._setup_test() as az_info:
            remote_path = RichPath.create(
                "azure://devstoreaccount1/test1/remote_path.txt", az_info)
            with TemporaryDirectory() as tmp_dir:
                data_f = os.path.join(tmp_dir, 'testdata.txt')
                with open(data_f, 'w') as f:
                    f.write("hello!")
                local_path = RichPath.create(data_f)
                self.assertEqual(local_path.read_as_text(), "hello!")
                local_size = local_path.get_size()

                remote_path.copy_from(local_path)
                self.assertTrue(local_path.exists())
                local_path.delete()
                self.assertFalse(local_path.exists())
                local_path.delete()
                with self.assertRaises(Exception):
                    local_path.delete(missing_ok=False)

            self.assertEqual(remote_path.read_as_text(), "hello!")

            # Read once again (should trigger cache)
            self.assertEqual(remote_path.read_as_text(), "hello!")

            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())
            self.assertFalse(remote_path.is_dir())
            self.assertEqual(local_size, remote_path.get_size())

            local_path = remote_path.to_local_path()
            self.assertTrue(local_path.exists())
            self.assertTrue(os.path.exists(local_path.path))
            with open(local_path.path, 'r') as f:
                self.assertEqual(f.read(), "hello!")

            # Delete file
            remote_path.delete()
            self.assertFalse(remote_path.exists())
            remote_path.delete()  # Should not raise Exception
            with self.assertRaises(FileNotFoundError):
                remote_path.delete(missing_ok=False)

            # Other random remote_path does not exist
            remote_path = RichPath.create(
                "azure://devstoreaccount1/test1/remote_path2.txt", az_info)
            self.assertFalse(remote_path.exists())
            self.assertFalse(remote_path.is_dir())
            self.assertFalse(remote_path.is_file())

            with self.assertRaises(Exception):
                remote_path.read_as_text()

            with self.assertRaises(Exception):
                remote_path.get_size()
Example #9
0
    def parse_jsonl_file(self, file: Path, output_folder: Path) -> None:
        print(f"Parsing data in {file}")
        input_file = RichPath.create(str(file))
        output_file = RichPath.create(
            str(output_folder / input_file.basename()))
        parsed_code = [
            self.process_data(self.extract_from_raw_data(raw_json_object))
            for raw_json_object in tqdm(input_file.read_by_file_suffix())
        ]
        print(f"Saving processed data in {output_file}")
        output_file.save_as_compressed_file(parsed_code)
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    type_lattice_file = RichPath.create(arguments['TYPE_LATTICE_FILE'],
                                        azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_BASE_FOLDER'],
                                    azure_info_path)
    input_folders, input_folder_basenames = [], set()
    print(arguments['INPUT_FOLDER'])
    print(arguments['TYPE_LATTICE_FILE'])
    print(arguments['OUTPUT_BASE_FOLDER'])
    for input_folder_name in arguments['INPUT_FOLDER']:
        input_folder_basename = os.path.basename(input_folder_name)
        if input_folder_basename in input_folder_basenames:
            raise ValueError("Several input folders with same basename '%s'!" %
                             (input_folder_basename, ))
        input_folder_basenames.add(input_folder_basename)
        input_folder = RichPath.create(input_folder_name)
        assert input_folder.is_dir(), "%s is not a folder" % (input_folder, )
        input_folders.append(input_folder)

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))
    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    model = model_class(hyperparameters, run_name=arguments.get('--run-name'))

    metadata_to_use = arguments.get('--metadata-to-use', None)
    if metadata_to_use is None:
        train_folder = input_folders[0]
        model.load_metadata(train_folder,
                            type_lattice_file,
                            max_num_files=int(arguments['--max-num-files']))
    else:
        metadata_path = RichPath.create(metadata_to_use, azure_info_path)
        model.load_existing_metadata(metadata_path)

    for_test = arguments.get('--for-test', False)
    model.make_model(is_train=not for_test)

    for input_folder in input_folders:
        input_folder_basename = input_folder.basename()
        this_output_folder = output_folder.join(input_folder_basename)
        this_output_folder.make_as_dir()
        model.tensorise_data_in_dir(input_folder,
                                    this_output_folder,
                                    for_test=for_test,
                                    max_num_files=int(
                                        arguments['--max-num-files']))
Example #11
0
def start(train_data_path, valid_folder_path, save_folder_path, model_class):

    train_folder = RichPath.create(train_data_path)
    valid_folder = RichPath.create(valid_folder_path)
    os.makedirs(save_folder_path, exist_ok=True)

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder,)
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder,)

    model_class = get_model_class_from_name(model_class)
    hyperparameters = model_class.get_default_hyperparameters()
    hyperparameters['run_id'] = "%s-%s" % (model_class.__name__, time.strftime("%Y-%m-%d-%H-%M-%S"))

    return run_train(model_class, train_folder, valid_folder, save_folder_path, hyperparameters)
Example #12
0
    def test_connection_types(self):
        for auth_type in AuthType:
            with self.subTest(f"Test {auth_type}"), self._setup_test(auth_type=auth_type) as az_info, TemporaryDirectory() as tmp_dir:
                data_f = os.path.join(tmp_dir, 'testtext.txt')
                with open(data_f, 'w') as f:
                    f.write("hello!")
                local_path = RichPath.create(data_f)

                remote_path = RichPath.create("azure://devstoreaccount1/test1/test_text.txt", az_info)
                remote_path.copy_from(local_path)
                local_path.delete()

                self.assertEqual(remote_path.read_as_text(), "hello!")
                remote_path.delete()
Example #13
0
def run(arguments) -> None:
    print("Loading data ...")
    dataset_params = JsonLMethod2CommentDataset.get_default_hyperparameters()
    dataset = JsonLMethod2CommentDataset(dataset_params)
    data_description = dataset.get_batch_tf_data_description()

    model = LanguageModel.restore(arguments["TRAINED_MODEL"], data_description.batch_features_shapes)
    print(f"  Loaded trained model from {arguments['TRAINED_MODEL']}.")

    dataset.load_vocab(model.vocab_source, model.vocab_target)
    data_path = RichPath.create(
        os.path.join(os.path.dirname(__file__), "datasets", arguments['TEST_DATA_DIR'])
    )
    dataset.load_data(data_path, folds_to_load=[DataFold.VALIDATION])
    test_data = dataset.get_tensorflow_dataset(DataFold.VALIDATION)

    print(
        f"  Loaded {len(list(test_data))} testing samples."
    )

    test_loss, test_acc, test_true, test_pred = model.run_one_epoch(
        test_data,
        training=False,
    )

    test_bleu, test_nist, test_dist, test_rouge2, test_rougel = calculate_metrics(test_true, test_pred)
    print(f"Test:  Loss {test_loss:.4f}, Acc {test_acc:.3f}, BLEU {test_bleu:.3f}")
    print(f"       NIST {test_nist:.3f}, DIST {test_dist:.3f}, ROUGE-2 {test_rouge2:.3f}, ROUGE-L {test_rougel:.3f}")
Example #14
0
    def load_data(self,
                  path: RichPath,
                  folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        if path is None:
            path = RichPath.create(self.default_data_directory())
        logger.info("Starting to load data.")

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {
                DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST
            }

        if DataFold.TRAIN in folds_to_load:
            self._loaded_data[DataFold.TRAIN] = self.__load_data(
                path.join("train.jsonl.gz"))
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(
                path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(
                path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #15
0
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)

    with ChunkWriter(output_folder,
                     file_prefix='codedata',
                     max_chunk_size=500,
                     file_suffix='.jsonl.gz') as chunked_writer:
        for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
            for line in file.read_by_file_suffix():
                tokens = line['code_tokens']
                chunked_writer.add(
                    dict(filename='%s:%s:%s' %
                         (line['repo'], line['path'], line['lineno']),
                         tokens=tokens))
Example #16
0
    def train(self, train_data: List[RichPath], valid_data: List[RichPath], quiet: bool=False, resume: bool=False) -> RichPath:
        model_path = RichPath.create(self.model_save_path)
        with self.__sess.as_default():
            tf.set_random_seed(self.hyperparameters['seed'])

            if resume:
                # Variables should have been restored.
                best_val_loss = self.__run_epoch_in_batches(valid_data, "RESUME (valid)", is_train=False, quiet=quiet)
                self.train_log('Validation Loss on Resume: %.6f' % (best_val_loss,))
            else:
                init_op = tf.variables_initializer(self.__sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
                self.__sess.run(init_op)
                self.save(model_path)
                best_val_loss = float("inf")
            no_improvement_counter = 0
            epoch_number = 0
            while (epoch_number < self.hyperparameters['max_epochs']
                   and no_improvement_counter < self.hyperparameters['patience']):
                self.train_log('==== Epoch %i ====' % (epoch_number,))
                train_loss = self.__run_epoch_in_batches(train_data, "%i (train)" % (epoch_number,), is_train=True, quiet=quiet)
                self.train_log(' Training Loss: %.6f' % (train_loss,))
                val_loss = self.__run_epoch_in_batches(valid_data, "%i (valid)" % (epoch_number,), is_train=False, quiet=quiet)
                self.train_log(' Validation Loss: %.6f' % (val_loss,))

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    no_improvement_counter = 0
                    self.save(model_path)
                    self.train_log("  Best result so far -- saving model as '%s'." % (model_path,))
                else:
                    no_improvement_counter += 1
                epoch_number += 1
        return model_path
Example #17
0
def test(model_path: str, test_data_path: Optional[RichPath], result_dir: str, quiet: bool = False, run_id: str = None):
    model = restore(model_path, result_dir, run_id)
    model.params['max_nodes_in_batch'] = 2 * model.params['max_nodes_in_batch']  # We can process larger batches if we don't do training
    test_data_path = test_data_path or RichPath.create(model.task.default_data_path())
    model.log_line(" Using the following task params: %s" % json.dumps(model.task.params))
    model.log_line(" Using the following model params: %s" % json.dumps(model.params))
    model.test(test_data_path)
Example #18
0
def jsonl_test_case():
    """
    加载data/test_datasets/..下的train.jsonl.gz文件和valid.jsonl.gz文件
    """
    dataset_params = JsonLGraphPropertyDataset.get_default_hyperparameters()
    dataset_params["num_fwd_edge_types"] = 4

    dataset = JsonLGraphPropertyDataset(dataset_params)
    data_path = RichPath.create(
        os.path.join(os.path.dirname(__file__), "..", "test_datasets"))
    dataset.load_data(data_path,
                      folds_to_load={DataFold.TRAIN, DataFold.VALIDATION})

    return TestCase(
        dataset=dataset,
        expected=TestExpectedValues(
            num_edge_types=dataset_params["num_fwd_edge_types"] + 1,
            node_feature_shape=(35, ),
            num_train_samples=10,
            num_valid_samples=10,
            labels_key_name="target_value",
            add_self_loop_edges=dataset_params["add_self_loop_edges"],
            tie_fwd_bkwd_edges=dataset_params["tie_fwd_bkwd_edges"],
            self_loop_edge_type=0,
        ),
    )
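Given the expected values above, a hedged sketch of what a single record in those train.jsonl.gz / valid.jsonl.gz files would look like; the field names follow the toy sample from Example #2 and the TestExpectedValues above, while the concrete numbers are placeholders:

# One hypothetical graph record (same triple convention as Example #2):
sample = {
    "graph": [(0, 1, 1), (1, 2, 2)],                    # edge types drawn from the 4 forward types
    "node_features": [[0.0] * 35 for _ in range(3)],    # per-node features of shape (35,)
    "target_value": [0.5],                              # label under labels_key_name; exact shape is an assumption
}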
Example #19
0
def test_train_code_model():
    data_path = RichPath.create(CODE_FILE_PATH)
    # Build the model and dataset from the given parameters.
    dataset, model = get_model_and_dataset(
        msg_passing_implementation='RGCN',
        data_path=data_path,
        task_name='Code',
        cli_data_hyperparameter_overrides=None,
        cli_model_hyperparameter_overrides=None,
        hyperdrive_hyperparameter_overrides={},
        folds_to_load={DataFold.TRAIN},
        load_weights_only=False,
        disable_tf_function_build=False,
        trained_model_file=None)

    def log(msg):
        log_line(LOG_FILE_PATH, msg)

    trained_model_path = train(model,
                               dataset,
                               log_fun=log,
                               run_id='1',
                               max_epochs=10,
                               patience=1,
                               save_dir=SAVE_MODEL_PATH,
                               quiet=False,
                               aml_run=None)
    print(trained_model_path)
Example #20
0
    def test_train_model(self):
        num_features = 100
        training_data, validation_data = self.__get_data(num_features)

        with tempfile.TemporaryDirectory() as dir:
            model_file = RichPath.create(dir).join('tmp.pkl.gz')

            model = SimpleRegression('SimpleRegressionTest', num_features)
            trainer = ComponentTrainer(model, model_file, max_num_epochs=50)
            trainer.train(training_data,
                          validation_data,
                          parallel_minibatch_creation=True)
            model_acc_1 = self.__compute_accuracy(model, validation_data)

            trained_model = SimpleRegression.restore_model(
                model_file)  # type: SimpleRegression
            trained_model_acc = self.__compute_accuracy(
                trained_model, validation_data)
            self.assertGreater(
                trained_model_acc, .95,
                f'Model achieves too low accuracy, {trained_model_acc:%}')

            self.assertAlmostEqual(
                trained_model_acc,
                model_acc_1,
                places=3,
                msg=f'Accuracy before and after loading does not match: '
                    f'{trained_model_acc} vs {model_acc_1}')
Example #21
0
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    valid_data_dir = test.expand_data_path(arguments['VALID_DATA_PATH'],
                                           azure_info_path)
    test_data_dir = test.expand_data_path(arguments['TEST_DATA_PATH'],
                                          azure_info_path)
    model_paths = RichPath.create(
        arguments['MODEL_PATH'],
        azure_info_path=azure_info_path).get_filtered_files_in_dir('*.pkl.gz')
    alpha = float(arguments['--alpha'])

    with Pool(int(arguments['--processes'])) as pool:
        results = pool.map(
            functools.partial(test.compute_evaluation_metrics,
                              arguments=arguments,
                              azure_info_path=azure_info_path,
                              valid_data_dirs=valid_data_dir,
                              test_data_dirs=test_data_dir,
                              return_results=True,
                              languages=['java'],
                              test_valid=False), model_paths)

    docstring_mrrs = [x['java'][0] for x in results]
    func_name_mrrs = [x['java'][1] for x in results]

    docstring_confidence = get_confidence_interval(docstring_mrrs, alpha)
    func_name_confidence = get_confidence_interval(func_name_mrrs, alpha)

    print(
        f'{alpha*100}% confidence interval for mrr using docstring as the query: {docstring_confidence}'
    )
    print(
        f'{alpha*100}% confidence interval for mrr using function name as the query: {func_name_confidence}'
    )
Example #22
0
    def test_freeze_params(self):
        num_features = 100
        training_data, validation_data = self.__get_data(num_features)

        with tempfile.TemporaryDirectory() as dir:
            model_file = RichPath.create(dir).join('tmp.pkl.gz')

            model = SimpleRegression('SimpleRegressionTest', num_features)
            trainer = ComponentTrainer(model, model_file, max_num_epochs=50)

            def get_freeze_weights():
                for p in model.parameters():
                    if len(p.shape) == 2:  # Just the weights
                        yield p

            trainer.train(
                training_data,
                validation_data,
                get_parameters_to_freeze=lambda: set(get_freeze_weights()))
            trained_model_acc = self.__compute_accuracy(model, validation_data)

            self.assertLess(
                trained_model_acc, .7,
                f'Model achieves too high accuracy but the weights were frozen, {trained_model_acc:%}'
            )
Example #23
0
def test_code_data_load():
    dataset_params = CodeDataset.get_default_hyperparameters()
    dataset = CodeDataset(dataset_params)
    dataset.load_data(RichPath.create(CODE_DATA_PATH),
                      folds_to_load={DataFold.TRAIN})
    for data_point in dataset._graph_iterator(DataFold.TRAIN):
        print(data_point.adjacency_lists)
Example #24
0
    def __init__(self,
                 out_folder: Union[RichPath, str],
                 file_prefix: str,
                 max_chunk_size: int,
                 file_suffix: str,
                 parallel_writers: int = 0,
                 mode: str = 'w'):
        self.__current_chunk = []  # type: List[T]
        if isinstance(out_folder, str):
            self.__out_folder = RichPath.create(out_folder)  # type: RichPath
        else:
            self.__out_folder = out_folder
        self.__out_folder.make_as_dir()
        self.__file_prefix = file_prefix
        self.__max_chunk_size = max_chunk_size
        self.__file_suffix = file_suffix

        self.__mode = mode.lower()
        assert self.__mode in ('a', 'w'), \
            'Mode must be either append (a) or write (w). Given: {0}'.format(mode)

        if self.__mode == 'w':
            self.__num_files_written = 0  # 'w' mode will begin writing from scratch
        else:
            # 'a' mode starts after the last-written file
            self.__num_files_written = self.__get_max_existing_index() + 1

        self.__parallel_writers = parallel_writers
        if self.__parallel_writers > 0:
            self.__writer_executors = ThreadPoolExecutor(
                max_workers=self.__parallel_writers)
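Example #15 shows ChunkWriter in its default write mode; below is a small sketch of the append path described by __init__ above, where mode='a' resumes numbering after the last existing chunk. The output folder and record contents are placeholders, and the import location of ChunkWriter (alongside RichPath in dpu_utils) is an assumption:

from dpu_utils.utils import ChunkWriter, RichPath

out_dir = RichPath.create('/tmp/codedata-chunks')  # placeholder output folder

# mode='a': numbering continues from the last chunk already on disk,
# so new chunks are appended rather than overwriting existing ones.
with ChunkWriter(out_dir, file_prefix='codedata', max_chunk_size=500,
                 file_suffix='.jsonl.gz', mode='a') as writer:
    writer.add({'filename': 'repo:path:42', 'tokens': ['def', 'foo', '(', ')', ':']})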
Example #25
0
    def save(self, path: RichPath) -> None:
        """Save the model at a given location."""
        with TemporaryDirectory() as tmpdir:
            target_file = os.path.join(tmpdir, 'model.pkl.gz')
            with gzip.open(target_file, 'wb') as f:
                torch.save(self, f)
            path.copy_from(RichPath.create(target_file))
def run_train(model_class: Type[Model],
              train_data_dirs: List[RichPath],
              valid_data_dirs: List[RichPath],
              save_folder: str,
              hyperparameters: Dict[str, Any],
              azure_info_path: Optional[str],
              run_name: str,
              quiet: bool = False,
              max_files_per_dir: Optional[int] = None,
              parallelize: bool = True) -> RichPath:
    assert parallelize
    model = model_class(hyperparameters,
                        run_name=run_name,
                        model_save_dir=save_folder,
                        log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(
            RichPath.create(model.model_save_path), is_train=True)
        model.train_log(
            "Resuming training run %s of model %s with following hypers:\n%s" %
            (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = True
    else:
        model.train_log(
            "Tokenizing and building vocabulary for code snippets and queries.  This step may take several hours."
        )
        model.load_metadata(train_data_dirs,
                            max_files_per_dir=max_files_per_dir,
                            parallelize=parallelize)
        model.make_model(is_train=True)

        model.train_log(
            "Starting training run %s of model %s with following hypers:\n%s" %
            (run_name, model.__class__.__name__, str(hyperparameters)))
        resume = False

    philly_job_id = os.environ.get('PHILLY_JOB_ID')
    if philly_job_id is not None:
        # We are running on Philly; write out the model name in an auxiliary file
        with open(os.path.join(save_folder, philly_job_id + '.job'), 'w') as f:
            f.write(os.path.basename(model.model_save_path))

    wandb.config.update(model.hyperparameters)
    model.train_log("Loading training and validation data.")
    train_data = model.load_data_from_dirs(train_data_dirs,
                                           is_test=False,
                                           max_files_per_dir=max_files_per_dir,
                                           parallelize=parallelize)
    valid_data = model.load_data_from_dirs(valid_data_dirs,
                                           is_test=False,
                                           max_files_per_dir=max_files_per_dir,
                                           parallelize=parallelize)
    model.train_log("Begin Training.")
    model_path = model.train(train_data,
                             valid_data,
                             azure_info_path,
                             quiet=quiet,
                             resume=resume)
    return model_path
Example #27
0
    def load_model(self):
        model_path = RichPath.create(self.local_model_path, None)
        print("Restoring model from %s" % model_path)
        self.model = model_restore_helper.restore(path=model_path,
                                                  is_train=False,
                                                  hyper_overrides={})

        for language in ['python', 'go', 'javascript', 'java', 'php', 'ruby']:
            # for language in ['python']:
            print("Loading language: %s" % language)
            self.definitions[language] = pickle.load(
                open(
                    '../resources/data/{}_dedupe_definitions_v2.pkl'.format(
                        language), 'rb'))

            if os.path.exists('/datadrive/{}.ann'.format(language)):
                self.indices[language] = AnnoyIndex(128, 'angular')
                self.indices[language].load(
                    '/datadrive/{}.ann'.format(language))
            else:
                indexes = [{
                    'code_tokens': d['function_tokens'],
                    'language': d['language']
                } for d in tqdm(self.definitions[language])]
                code_representations = self.model.get_code_representations(
                    indexes)
                print(code_representations[0].shape)
                self.indices[language] = AnnoyIndex(
                    code_representations[0].shape[0], 'angular')
                for index, vector in tqdm(enumerate(code_representations)):
                    assert vector is not None
                    self.indices[language].add_item(index, vector)
                self.indices[language].build(1000)
                self.indices[language].save(
                    '/datadrive/{}.ann'.format(language))
Example #28
0
    def test_cache_correctness(self):
        with self._setup_test() as az_info:
            for suffix in ('.jsonl.gz', '.msgpack.l.gz'):
                random_elements = list(range(100))
                remote_path = RichPath.create("azure://devstoreaccount1/test1/compressed/data" + suffix, az_info)
                remote_path.save_as_compressed_file(random_elements)

                # Read once
                read_nums = list(remote_path.read_by_file_suffix())
                self.assertListEqual(read_nums, random_elements)

                # Hit Cache
                read_nums = list(remote_path.read_by_file_suffix())
                self.assertListEqual(read_nums, random_elements)
                self.assertTrue(remote_path.exists())
                self.assertTrue(remote_path.is_file())

                # Update file through other means, and ensure that cache is appropriately invalidated.
                new_elements = list(range(500))
                with TemporaryDirectory() as tmp:
                    path = os.path.join(tmp, 'tst'+suffix)
                    if suffix == '.jsonl.gz':
                        save_jsonl_gz(new_elements, path)
                    else:
                        save_msgpack_l_gz(new_elements, path)
                    container_client = ContainerClient.from_connection_string(self.AZURITE_DEVELOPMENT_CONNECTION_STRING,
                                                                              "test1")
                    blob_client = container_client.get_blob_client("compressed/data" + suffix)
                    with open(path, 'rb') as f:
                        blob_client.upload_blob(f, overwrite=True)

                read_nums = list(remote_path.read_by_file_suffix())
                self.assertListEqual(read_nums, new_elements)
                self.assertTrue(remote_path.exists())
                self.assertTrue(remote_path.is_file())
Example #29
0
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str]=None,
              quiet: bool=False) \
        -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name, model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(
                                                                                                 hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(hyperparameters)))
        resume = False
    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths, quiet=quiet, resume=resume)
    return model_path
def expand_data_path(data_path: str, azure_info_path: Optional[str]) -> List[RichPath]:
    """
    Args:
        data_path: A path to either a file or a directory. If it's a file, we interpret it as a list of
            data directories.

    Returns:
        List of data directories (potentially just data_path)
    """
    data_rpath = RichPath.create(data_path, azure_info_path)

    if data_rpath.is_dir():
        return [data_rpath]

    return [RichPath.create(data_dir, azure_info_path)
            for data_dir in data_rpath.read_as_text().splitlines()]
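The second branch of expand_data_path reads a plain-text file that lists one data directory per line. A small sketch of driving it that way (all paths are placeholders):

# Hypothetical listing file with one data directory per line.
with open('/tmp/data_dirs.txt', 'w') as f:
    f.write('/data/java/train_part1\n/data/java/train_part2\n')

data_dirs = expand_data_path('/tmp/data_dirs.txt', azure_info_path=None)
# -> [RichPath for /data/java/train_part1, RichPath for /data/java/train_part2]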