    def load_data(self,
                  path: RichPath,
                  folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        logger.info(f"Starting to load data from {path}.")
        self.load_metadata(path)

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {
                DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST
            }

        if DataFold.TRAIN in folds_to_load:
            self._loaded_data[DataFold.TRAIN] = self.__load_data(
                path.join("train.jsonl.gz"))
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(
                path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(
                path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #2
def run_stats(graph_path: RichPath, output_path: RichPath):
    number_graphs, number_annotations, number_variables = 0, 0, 0
    annotation_table = Counter()
    data_generator = chain(
        *(g.read_as_jsonl()
          for g in graph_path.iterate_filtered_files_in_dir('*.jsonl.gz')))
    for data in data_generator:
        number_graphs += 1 if len(data['supernodes']) > 0 else 0
        number_variables += len(data['supernodes'])
        number_annotations += sum(
            1 for supernode in data['supernodes'].values()
            if supernode['annotation'] not in {None, 'None', 'Nothing', 'Any'})
        annotation_table.update((supernode['annotation']
                                 for supernode in data['supernodes'].values()
                                 if supernode['annotation'] not in
                                 {None, 'None', 'Nothing', 'Any'}))
    with open(output_path.to_local_path().path, "a") as f:
        f.write("Statistics for file: " + graph_path.to_local_path().path +
                "\n")
        f.write("Number of graphs: %d\n" % (number_graphs))
        f.write("Number of variables: %d\n" % (number_variables))
        f.write("Number of annotations: %d\n" % (number_annotations))
        f.write("Number of different annotations: %d\n" %
                (len(list(annotation_table))))
        f.write("\nFrequency distribution of annotations type:\n\n")
        for annotation, value in annotation_table.most_common():
            f.write("%s\t%d\n" % (annotation, value))
Example #3
def run(arguments, tag_in_vcs=False) -> RichPath:
    azure_info_path = arguments.get('--azure-info', None)
    train_folder = RichPath.create(arguments['TRAIN_DATA_PATH'],
                                   azure_info_path)
    valid_folder = RichPath.create(arguments['VALID_DATA_PATH'],
                                   azure_info_path)
    save_folder = arguments['SAVE_FOLDER']

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder, )
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder, )

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))

    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))
    hyperparameters['run_id'] = make_run_id(arguments)

    os.makedirs(save_folder, exist_ok=True)

    if tag_in_vcs:
        hyperparameters['git_commit'] = git_tag_run(hyperparameters['run_id'])

    return run_train(model_class, train_folder, valid_folder, save_folder,
                     hyperparameters, arguments.get('--run-name'),
                     arguments.get('--quiet', False))
Example #4
    def load_data(self,
                  path: RichPath,
                  folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        logger.info(f"Starting to load data from {path}.")

        if self.metadata == {}:
            metadata_path = path.join("metadata.pkl.gz")
            if metadata_path.exists():
                logger.info(f"Loading metadata from {metadata_path}")
                self._metadata = metadata_path.read_by_file_suffix()
        else:
            logger.warning(
                "Using metadata passed to constructor, not metadata stored with data."
            )

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {
                DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST
            }

        if DataFold.TRAIN in folds_to_load:
            self._loaded_data[DataFold.TRAIN] = self.__load_data(
                path.join("train.jsonl.gz"))
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(
                path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(
                path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #5
    def __init__(self, type_lattice_path: RichPath, bottom_symbol: str,
                 built_in_aliases_path: RichPath):
        reference_lattice = type_lattice_path.read_as_json()
        id_aliases = {}
        self.__lattice_node_to_id = {}
        for i, n in enumerate(reference_lattice['nodes']):
            if n in self.__lattice_node_to_id:
                id_aliases[i] = self.__lattice_node_to_id[n]
            else:
                self.__lattice_node_to_id[n] = i

        assert self.__lattice_node_to_id[
            bottom_symbol] == 0, 'We assume below that bottom is at position 0'
        self.__id_to_node = reference_lattice['nodes']

        def alias_or_self(idx: int) -> int:
            if idx in id_aliases:
                return id_aliases[idx]
            else:
                return idx

        self.__aliases = dict(
            built_in_aliases_path.read_as_json()['aliasing_rules'])

        self.__is_a_edges = defaultdict(set)
        self.__child_edges = defaultdict(set)
        for from_id, to_id in reference_lattice['edges']:
            from_id = alias_or_self(from_id)
            to_id = alias_or_self(to_id)

            self.__is_a_edges[from_id].add(to_id)
            self.__child_edges[to_id].add(from_id)

        self._force_fix_lattice()
Example #6
def run(args):

    azure_info_path = args.get('--azure-info', None)
    input_path = RichPath.create(args['INPUT_FILENAME'], azure_info_path)
    output_folder = RichPath.create(args['OUTPUT_FOLDER'], azure_info_path)
    train = float(args['--train-ratio'])
    valid = float(args['--valid-ratio'])
    test = float(args['--test-ratio'])
    holdout = float(args['--holdout-ratio'])

    # get data and process it
    df = jsonl_to_df(input_path)
    print('Removing fuzzy duplicates ... this may take some time.')
    df = remove_duplicate_code_df(df)
    df = df.sample(frac=1, random_state=20181026)  # shuffle order of files
    df = label_folds(df, train_ratio=train, valid_ratio=valid, test_ratio=test, holdout_ratio=holdout)
    splits = ['train', 'valid', 'test', 'holdout']

    for split in splits:
        split_df = df[df.partition == split]

        # save dataframes as chunked jsonl files
        jsonl_save_folder = output_folder.join(f'jsonl/{split}')
        print(f'Uploading data to {str(jsonl_save_folder)}')
        chunked_save_df_to_jsonl(split_df, jsonl_save_folder)

        # Upload dataframes to Azure
        filename = f'/tmp/{split}_df.pkl'
        df_save_path = output_folder.join(f'DataFrame/{split}_df.pkl')
        split_df.to_pickle(filename)
        print(f'Uploading data to {str(df_save_path)}')
        df_save_path.copy_from(RichPath.create(filename))
        os.unlink(filename)
Example #7
 def load_data(self, path: RichPath) -> None:
     # Note that as __load_data produces a generator, we explicitly force loading
     # (and caching) here:
     self._loaded_data[DataFold.TRAIN] = \
         list(self.__load_data(path.join("graphs-train"), DataFold.TRAIN))
     self._loaded_data[DataFold.VALIDATION] = \
         list(self.__load_data(path.join("graphs-valid"), DataFold.VALIDATION))
Example #8
    def test_copy_from(self):
        with self._setup_test() as az_info, TemporaryDirectory() as tmp_dir:
            elements = [[i, i//2] for i in range(10000)]
            tmp_local_path = RichPath.create(tmp_dir).join("sample.json.gz")
            tmp_local_path.save_as_compressed_file(elements)

            remote_path1 = RichPath.create(f"azure://devstoreaccount1/test1/sample1.json.gz", az_info)
            self.assertFalse(remote_path1.exists())

            remote_path1.copy_from(tmp_local_path)
            tmp_local_path.delete()

            self.assertFalse(tmp_local_path.exists())
            self.assertTrue(remote_path1.exists())

            read_elements = remote_path1.read_by_file_suffix()
            self.assertListEqual(elements, read_elements)

            remote_path2 = RichPath.create(f"azure://devstoreaccount1/test1/sample2.json.gz", az_info)
            remote_path2.copy_from(remote_path1)
            remote_path1.delete()

            read_elements = remote_path2.read_by_file_suffix()
            self.assertListEqual(elements, read_elements)

            read_elements = remote_path2.to_local_path().read_by_file_suffix()
            self.assertListEqual(elements, read_elements)
            remote_path2.delete()
Example #9
def get_staqc_dataset(path: RichPath) -> List[Dict[str, Any]]:
    codes = path.get_filtered_files_in_dir(
        'python*qid_by*code.pickle')[0].read_as_pickle()
    titles = path.get_filtered_files_in_dir(
        'python*qid_by*title.pickle')[0].read_as_pickle()
    data = chain([{
        'code': code,
        'code_tokens': tokenize_python_from_string(code, func_only=False).code_tokens,
        'docstring': titles[_id],
        'docstring_tokens': tokenize_docstring_from_string(titles[_id]),
        'language': 'python'
    } for _id, code in codes.items()])

    filtered_data = filter_untokenizable_code(data)
    log_row_count_diff(original_data=codes.items(),
                       filtered_data=filtered_data,
                       label='StaQC')

    assert len(filtered_data) > 0, 'No code tokens retrieved after applying filters for StaQC.'
    return filtered_data
Example #10
 def save(self, path: RichPath) -> None:
     """Save the model at a given location."""
     with TemporaryDirectory() as tmpdir:
         target_file = os.path.join(tmpdir, 'model.pkl.gz')
         with gzip.open(target_file, 'wb') as f:
             torch.save(self, f)
         path.copy_from(RichPath.create(target_file))
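
A matching load sketch for the gzip-wrapped torch.save format used above; this helper is an assumption added for illustration, not part of the original class.

import gzip

import torch
from dpu_utils.utils import RichPath


def load_model(path: RichPath):
    """Hypothetical inverse of save(): fetch the archive locally and unpickle the model."""
    local_file = path.to_local_path()  # for remote paths this materialises a local copy
    with gzip.open(local_file.path, "rb") as f:
        return torch.load(f)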
Example #11
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str]=None,
              quiet: bool=False) \
        -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name, model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(
                                                                                                 hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(hyperparameters)))
        resume = False
    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths, quiet=quiet, resume=resume)
    return model_path
Example #12
def qm9_train_valid_paths(tmp_data_dir):
    """
    The format of our code dataset is quite similar to the QM9 dataset format,
    so it can be processed in the same way. Each sample has three parts:
    - the graph representation, as (node, edge, node) triples
    - the node features, one-hot encoded
    - the target (prediction) value
    """
    train_valid_paths = [
        os.path.join(tmp_data_dir, f"{split}.jsonl.gz")
        for split in ["train", "valid"]
    ]

    data_samples = 5 * [{
        # Edge between vertices 0 and 1, with type 1, given as a (node, edge type, node) triple.
        "graph": [(0, 1, 1)],
        # Two nodes, with one-hot encoded features of shape (2,).
        "node_features": [[1, 0], [0, 1]],
        # Target value for the graph: the energy value to predict, a 32-bit float.
        "targets": [[1.0]],
    }]

    for path in train_valid_paths:
        RichPath.create(path).save_as_compressed_file(data_samples)

    # The temporary files created above are handed to the test here and removed once it finishes.
    yield train_valid_paths

    for path in train_valid_paths:
        os.remove(path)
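
A sketch of how this fixture could be consumed, assuming qm9_train_valid_paths is registered as a pytest fixture; the test only relies on RichPath.read_by_file_suffix, which appears elsewhere in these examples.

from dpu_utils.utils import RichPath


def test_qm9_like_files_roundtrip(qm9_train_valid_paths):
    # The fixture yields [train_path, valid_path]; both are small compressed JSONL files.
    for path in qm9_train_valid_paths:
        samples = list(RichPath.create(path).read_by_file_suffix())
        assert len(samples) == 5
        assert {"graph", "node_features", "targets"} <= samples[0].keys()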
Example #13
    def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        logger.info(f"Starting to load data from {path}.")

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

        if DataFold.TRAIN in folds_to_load:
            data_file = path.join("train.jsonl.gz")
            self.vocab_source = self._build_vocab(
                dataset = [datapoint["graph"]["node_features"] for datapoint in data_file.read_by_file_suffix()],
                vocab_size=self.max_vocab_size
            )
            self.vocab_target = self._build_vocab(
                dataset = [datapoint["Target"] for datapoint in data_file.read_by_file_suffix()],
                vocab_size=self.max_vocab_size
            )
            self._loaded_data[DataFold.TRAIN] = self.__load_data(data_file)
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #14
    def __load_data(self, data_dir: RichPath,
                    data_fold: DataFold) -> List[GraphSample]:
        if data_fold is None:
            data_fold = DataFold.TRAIN
        if data_fold == DataFold.TRAIN:
            data_name = "train"
        elif data_fold == DataFold.VALIDATION:
            data_name = "valid"
        elif data_fold == DataFold.TEST:
            data_name = "test"
        else:
            raise ValueError("Unknown data fold '%s'" % str(data_fold))

        print(" Loading DEOBFUSCATION %s data from %s." %
              (data_name, data_dir))

        if data_dir.join("%s-saved.pkl.gz" % data_name).is_file():
            read_data = data_dir.join("%s-saved.pkl.gz" %
                                      data_name).read_by_file_suffix()
            return read_data["all_graphs"], read_data["properties"]

        all_untensorised = data_dir.join("%s.pkl.gz" %
                                         data_name).read_by_file_suffix()

        graphs = all_untensorised["graphs"]

        properties = dict()
        properties["all_user_nodes"] = all_untensorised["name_to_id_mapping"]
        properties["user_defined_nodes_number"] = all_untensorised[
            "total_user_defined_nodes"]
        properties["edge_mapping"] = all_untensorised["edge_name_to_id"]
        properties["__num_labels"] = len(properties["all_user_nodes"])
        properties["__num_edge_types"] = len(properties["edge_mapping"])
        properties["__num_types"] = len(all_untensorised["type_to_id"])

        all_graphs = []
        for i in tqdm(range(len(graphs))):
            old_graph = graphs[i]
            if old_graph["user_defined_nodes_number"] > 0:
                all_graphs.append(
                    self.create_graph_sample(
                        old_graph, properties["__num_edge_types"],
                        len(properties["all_user_nodes"])))

        print_graph_number = 2500
        print([
            all_untensorised["ids_to_names"][x]
            for x in all_graphs[print_graph_number].labels
        ], all_graphs[print_graph_number].nodes_mask)
        print(all_graphs[print_graph_number])
        to_save = dict()
        to_save["all_graphs"] = all_graphs
        to_save["properties"] = properties
        data_dir.join("%s-saved.pkl.gz" %
                      data_name).save_as_compressed_file(to_save)

        print("Saved modified data to %s-saved.pkl.gz" % data_name)

        return all_graphs, properties
Example #15
def jsonl_to_df(input_folder: RichPath, sample_percent: float, files_remaining: dict, azure_info_path):
    "Concatenates a sample of the jsonl files from the given path and returns them as a single pandas.DataFrame, together with a per-file project map."

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    all_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    
    sample_size = math.ceil(sample_percent * len(all_files))

    if sample_size > len(files_remaining):
        sample_size = len(files_remaining)
    files = random.sample(list(files_remaining.keys()), sample_size)
    replaced = [0 for x in range(sample_size)]

    while True:
        for i in range(len(files)):
            f = files[i]
            other_files = {x for x in files if x != f}
            if f not in files_remaining or len(set.intersection(files_remaining[f], other_files)) == len(files) - 1:
                replaced[i] = 1
                f = random.sample(list(files_remaining.keys()), 1)
                while f[0] in files:
                    f = random.sample(list(files_remaining.keys()), 1)
                files[i] = f[0]
            else:
                replaced[i] = 0
        if sum(replaced) < 2:
            break

    for f in files:
        files_remaining[f] = files_remaining[f].union({x for x in files if x != f})
        if len(files_remaining[f]) == len(all_files) - 1:
            del files_remaining[f]
        # Use a separate name for the file handle so it does not shadow the loop variable `f`.
        with open('files_remaining.txt', 'w+') as out_file:
            files_remaining_converted = {}

            for path in files_remaining:
                files_remaining_converted[path] = list(files_remaining[path])

            print(json.dumps(files_remaining_converted), file=out_file)

    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    project_map = {x:[] for x in files}
    print(project_map)
    for f in tqdm(files, total=len(files)):
        rich_f = RichPath.create(f, azure_info_path)
        lines = list(rich_f.read_as_jsonl(error_handling=lambda m,e: print(f'Error while loading {m} : {e}')))
        lines_with_docstrings = []

        for line in lines:
            if len(line['docstring_tokens']) > 0:
                lines_with_docstrings.append(line)
                
                if line['nwo'] not in project_map[str(rich_f)]:
                    project_map[str(rich_f)].append(line['nwo'])
        
        dfs.append(pd.DataFrame(lines_with_docstrings))
    return pd.concat(dfs), project_map
Example #16
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'], azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA'], azure_info_path)
    run_test(model_path, test_folder, type_lattice_path, alias_metadata_path,
             arguments['--print-predictions'])
Example #17
def df_to_jsonl(df: pd.DataFrame,
                RichPath_obj: RichPath,
                i: int,
                basefilename='codedata') -> str:
    dest_filename = f'{basefilename}_{str(i).zfill(5)}.jsonl.gz'
    RichPath_obj.join(dest_filename).save_as_compressed_file(
        df.to_dict(orient='records'))
    return str(RichPath_obj.join(dest_filename))
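
For context, a possible chunking loop on top of df_to_jsonl; an illustrative sketch only (the helper name and chunk size are assumptions), not the chunked_save_df_to_jsonl used in the earlier splitting example.

import pandas as pd
from dpu_utils.utils import RichPath


def save_df_in_chunks(df: pd.DataFrame, output_folder: RichPath,
                      chunk_size: int = 10000) -> None:
    # Write the DataFrame as consecutive codedata_XXXXX.jsonl.gz chunks via df_to_jsonl.
    for i, start in enumerate(range(0, len(df), chunk_size)):
        df_to_jsonl(df.iloc[start:start + chunk_size], output_folder, i)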
Example #18
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    test_folder = RichPath.create(arguments['TEST_DATA_PATH'], azure_info_path)
    model_path = RichPath.create(arguments['MODEL_PATH'])
    output_folder = arguments['OUTPUT_FOLDER']
    os.makedirs(output_folder, exist_ok=True)
    num_processes = int(arguments['--num-processes'])
    run_test(model_path, test_folder, output_folder, num_processes)
Example #19
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    predictions_path = RichPath.create(arguments['PREDICTIONS_JSONL_GZ'],
                                       azure_info_path)
    type_lattice_path = RichPath.create(arguments['TYPE_LATTICE_PATH'],
                                        azure_info_path)
    alias_metadata_path = RichPath.create(arguments['ALIAS_METADATA_PATH'],
                                          azure_info_path)
    compute(predictions_path, type_lattice_path, alias_metadata_path)
Example #20
    def test_simple_read_write(self):
        with self._setup_test() as az_info:
            remote_path = RichPath.create(
                "azure://devstoreaccount1/test1/remote_path.txt", az_info)
            with TemporaryDirectory() as tmp_dir:
                data_f = os.path.join(tmp_dir, 'testdata.txt')
                with open(data_f, 'w') as f:
                    f.write("hello!")
                local_path = RichPath.create(data_f)
                self.assertEqual(local_path.read_as_text(), "hello!")
                local_size = local_path.get_size()

                remote_path.copy_from(local_path)
                self.assertTrue(local_path.exists())
                local_path.delete()
                self.assertFalse(local_path.exists())
                local_path.delete()
                with self.assertRaises(Exception):
                    local_path.delete(missing_ok=False)

            self.assertEqual(remote_path.read_as_text(), "hello!")

            # Read once again (should trigger cache)
            self.assertEqual(remote_path.read_as_text(), "hello!")

            self.assertTrue(remote_path.exists())
            self.assertTrue(remote_path.is_file())
            self.assertFalse(remote_path.is_dir())
            self.assertEqual(local_size, remote_path.get_size())

            local_path = remote_path.to_local_path()
            self.assertTrue(local_path.exists())
            self.assertTrue(os.path.exists(local_path.path))
            with open(local_path.path, 'r') as f:
                self.assertEqual(f.read(), "hello!")

            # Delete file
            remote_path.delete()
            self.assertFalse(remote_path.exists())
            remote_path.delete()  # Should not raise Exception
            with self.assertRaises(FileNotFoundError):
                remote_path.delete(missing_ok=False)

            # Other random remote_path does not exist
            remote_path = RichPath.create(
                "azure://devstoreaccount1/test1/remote_path2.txt", az_info)
            self.assertFalse(remote_path.exists())
            self.assertFalse(remote_path.is_dir())
            self.assertFalse(remote_path.is_file())

            with self.assertRaises(Exception):
                remote_path.read_as_text()

            with self.assertRaises(Exception):
                remote_path.get_size()
Example #21
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
    "Concatenates all jsonl files from path and returns them as a single pandas.DataFrame."

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    for f in tqdm(files, total=len(files)):
        dfs.append(pd.DataFrame(list(f.read_as_jsonl(error_handling=lambda m,e: print(f'Error while loading {m} : {e}')))))
    return pd.concat(dfs)
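
A one-line usage sketch with a placeholder directory:

from dpu_utils.utils import RichPath

# Placeholder directory containing *.jsonl.gz files.
df = jsonl_to_df(RichPath.create("data/jsonl/train"))
print(df.shape)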
Example #22
 def parse_jsonl_file(self, file: Path, output_folder: Path) -> None:
     print(f"Parsing data in {file}")
     input_file = RichPath.create(str(file))
     output_file = RichPath.create(
         str(output_folder / input_file.basename()))
     parsed_code = [
         self.process_data(self.extract_from_raw_data(raw_json_object))
         for raw_json_object in tqdm(input_file.read_by_file_suffix())
     ]
     print(f"Saving processed data in {output_file}")
     output_file.save_as_compressed_file(parsed_code)
Example #23
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    type_lattice_file = RichPath.create(arguments['TYPE_LATTICE_FILE'],
                                        azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_BASE_FOLDER'],
                                    azure_info_path)
    input_folders, input_folder_basenames = [], set()
    print(arguments['INPUT_FOLDER'])
    print(arguments['TYPE_LATTICE_FILE'])
    print(arguments['OUTPUT_BASE_FOLDER'])
    for input_folder_name in arguments['INPUT_FOLDER']:
        input_folder_basename = os.path.basename(input_folder_name)
        if input_folder_basename in input_folder_basenames:
            raise ValueError("Several input folders with same basename '%s'!" %
                             (input_folder_basename, ))
        input_folder_basenames.add(input_folder_basename)
        input_folder = RichPath.create(input_folder_name)
        assert input_folder.is_dir(), "%s is not a folder" % (input_folder, )
        input_folders.append(input_folder)

    model_class = get_model_class_from_name(arguments.get('--model', 'nag'))
    hyperparameters = model_class.get_default_hyperparameters()
    hypers_override = arguments.get('--hypers-override')
    if hypers_override is not None:
        hyperparameters.update(json.loads(hypers_override))

    model = model_class(hyperparameters, run_name=arguments.get('--run-name'))

    metadata_to_use = arguments.get('--metadata-to-use', None)
    if metadata_to_use is None:
        train_folder = input_folders[0]
        model.load_metadata(train_folder,
                            type_lattice_file,
                            max_num_files=int(arguments['--max-num-files']))
    else:
        metadata_path = RichPath.create(metadata_to_use, azure_info_path)
        model.load_existing_metadata(metadata_path)

    for_test = arguments.get('--for-test', False)
    model.make_model(is_train=not for_test)

    for input_folder in input_folders:
        input_folder_basename = input_folder.basename()
        this_output_folder = output_folder.join(input_folder_basename)
        this_output_folder.make_as_dir()
        model.tensorise_data_in_dir(input_folder,
                                    this_output_folder,
                                    for_test=for_test,
                                    max_num_files=int(
                                        arguments['--max-num-files']))
Example #24
    def test_connection_types(self):
        for auth_type in AuthType:
            with self.subTest(f"Test {auth_type}"), self._setup_test(auth_type=auth_type) as az_info, TemporaryDirectory() as tmp_dir:
                data_f = os.path.join(tmp_dir, 'testtext.txt')
                with open(data_f, 'w') as f:
                    f.write("hello!")
                local_path = RichPath.create(data_f)

                remote_path = RichPath.create("azure://devstoreaccount1/test1/test_text.txt", az_info)
                remote_path.copy_from(local_path)
                local_path.delete()

                self.assertEqual(remote_path.read_as_text(), "hello!")
                remote_path.delete()
Example #25
def start(train_data_path, valid_folder_path, save_folder_path, model_class):

    train_folder = RichPath.create(train_data_path)
    valid_folder = RichPath.create(valid_folder_path)
    os.makedirs(save_folder_path, exist_ok=True)

    assert train_folder.is_dir(), "%s is not a folder" % (train_folder,)
    assert valid_folder.is_dir(), "%s is not a folder" % (valid_folder,)

    model_class = get_model_class_from_name(model_class)
    hyperparameters = model_class.get_default_hyperparameters()
    hyperparameters['run_id'] = "%s-%s" % (model_class.__class__.__name__, time.strftime("%Y-%m-%d-%H-%M-%S"))

    return run_train(model_class, train_folder, valid_folder, save_folder_path, hyperparameters)
Example #26
    def save(self, path: RichPath) -> None:
        variables_to_save = list(set(self.__sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        weights_to_save = self.__sess.run(variables_to_save)
        weights_to_save = {var.name: value
                           for (var, value) in zip(variables_to_save, weights_to_save)}

        data_to_save = {
                         "model_type": type(self).__name__,
                         "hyperparameters": self.hyperparameters,
                         "metadata": self.__metadata,
                         "weights": weights_to_save,
                         "run_name": self.__run_name,
                       }

        path.save_as_compressed_file(data_to_save)
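
Because the checkpoint is written with save_as_compressed_file, it can be read back with read_by_file_suffix; a minimal inspection sketch with a placeholder path (the full restore logic is model-specific and omitted).

from dpu_utils.utils import RichPath

# Placeholder path to a checkpoint produced by save() above.
saved = RichPath.create("trained_model.pkl.gz").read_by_file_suffix()
print(saved["model_type"], saved["run_name"])
# saved["weights"] maps TensorFlow variable names to their saved values.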
Example #27
def test_code_data_load():
    dataset_params = CodeDataset.get_default_hyperparameters()
    dataset = CodeDataset(dataset_params)
    dataset.load_data(RichPath.create(CODE_DATA_PATH),
                      folds_to_load={DataFold.TRAIN})
    for data_point in dataset._graph_iterator(DataFold.TRAIN):
        print(data_point.adjacency_lists)
Example #28
def run_export(model_path: RichPath, test_data_path: RichPath,
               output_folder: str):
    test_hyper_overrides = {
        'run_id': 'exporting',
        "dropout_keep_rate": 1.0,
    }

    data_chunks = test_data_path.get_filtered_files_in_dir('*gz')

    # Restore model
    model = model_restore_helper.restore(model_path,
                                         is_train=False,
                                         hyper_overrides=test_hyper_overrides)

    exporting = model.export_representations(data_chunks)

    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, 'vectors.tsv'), 'w') as vectors_file,\
            open(os.path.join(output_folder, 'metadata.tsv'), 'w') as metadata_file:

        metadata_file.write('varname\ttype\tkind\tprovenance\n')
        for annot in exporting:
            metadata_file.write(
                f'{assert_valid_str(annot.name)}\t{assert_valid_str(annot.type_annotation)}\t{assert_valid_str(annot.kind)}\t{assert_valid_str(annot.provenance)}\n'
            )
            vectors_file.write('\t'.join(str(e) for e in annot.representation))
            vectors_file.write('\n')
Example #29
def run(arguments):
    azure_info_path = arguments.get('--azure-info', None)
    input_folder = RichPath.create(arguments['INPUT_PATH'], azure_info_path)
    output_folder = RichPath.create(arguments['OUTPUT_PATH'], azure_info_path)

    with ChunkWriter(output_folder,
                     file_prefix='codedata',
                     max_chunk_size=500,
                     file_suffix='.jsonl.gz') as chunked_writer:
        for file in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'):
            for line in file.read_by_file_suffix():
                tokens = line['code_tokens']
                chunked_writer.add(
                    dict(filename='%s:%s:%s' %
                         (line['repo'], line['path'], line['lineno']),
                         tokens=tokens))
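
The chunks written by ChunkWriter can be read back by globbing the output folder; a small sketch with a placeholder path.

from dpu_utils.utils import RichPath

# Placeholder output folder; the pattern matches the codedata_*.jsonl.gz chunks written above.
for chunk_file in RichPath.create("out_dir").iterate_filtered_files_in_dir("codedata*.jsonl.gz"):
    for record in chunk_file.read_by_file_suffix():
        print(record["filename"], len(record["tokens"]))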
Example #30
def jsonl_test_case():
    """
    Load the train.jsonl.gz and valid.jsonl.gz files under data/test_datasets/..
    """
    dataset_params = JsonLGraphPropertyDataset.get_default_hyperparameters()
    dataset_params["num_fwd_edge_types"] = 4

    dataset = JsonLGraphPropertyDataset(dataset_params)
    data_path = RichPath.create(
        os.path.join(os.path.dirname(__file__), "..", "test_datasets"))
    dataset.load_data(data_path,
                      folds_to_load={DataFold.TRAIN, DataFold.VALIDATION})

    return TestCase(
        dataset=dataset,
        expected=TestExpectedValues(
            num_edge_types=dataset_params["num_fwd_edge_types"] + 1,
            node_feature_shape=(35, ),
            num_train_samples=10,
            num_valid_samples=10,
            labels_key_name="target_value",
            add_self_loop_edges=dataset_params["add_self_loop_edges"],
            tie_fwd_bkwd_edges=dataset_params["tie_fwd_bkwd_edges"],
            self_loop_edge_type=0,
        ),
    )