def load_data(self, path: RichPath, folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        logger.info(f"Starting to load data from {path}.")

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST}

        if DataFold.TRAIN in folds_to_load:
            data_file = path.join("train.jsonl.gz")
            # Read the training datapoints once and reuse them for both vocabularies:
            train_datapoints = list(data_file.read_by_file_suffix())
            self.vocab_source = self._build_vocab(
                dataset=[datapoint["graph"]["node_features"] for datapoint in train_datapoints],
                vocab_size=self.max_vocab_size,
            )
            self.vocab_target = self._build_vocab(
                dataset=[datapoint["Target"] for datapoint in train_datapoints],
                vocab_size=self.max_vocab_size,
            )
            self._loaded_data[DataFold.TRAIN] = self.__load_data(data_file)
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #2
    def load_data(self,
                  path: RichPath,
                  folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        logger.info(f"Starting to load data from {path}.")

        if self.metadata == {}:
            metadata_path = path.join("metadata.pkl.gz")
            if metadata_path.exists():
                logger.info(f"Loading metadata from {metadata_path}")
                self._metadata = metadata_path.read_by_file_suffix()
        else:
            logger.warning(
                "Using metadata passed to constructor, not metadata stored with data."
            )

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {
                DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST
            }

        if DataFold.TRAIN in folds_to_load:
            self._loaded_data[DataFold.TRAIN] = self.__load_data(
                path.join("train.jsonl.gz"))
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(
                path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(
                path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #3
 def load_data(self, path: RichPath) -> None:
     # Note that as __load_data produces a generator, we explicitly force loading
     # (and caching) here:
     self._loaded_data[DataFold.TRAIN] = \
         list(self.__load_data(path.join("graphs-train"), DataFold.TRAIN))
     self._loaded_data[DataFold.VALIDATION] = \
         list(self.__load_data(path.join("graphs-valid"), DataFold.VALIDATION))
Example #4
    def load_data(self,
                  path: RichPath,
                  folds_to_load: Optional[Set[DataFold]] = None) -> None:
        """Load the data from disk."""
        logger.info(f"Starting to load data from {path}.")
        self.load_metadata(path)

        # If we haven't defined what folds to load, load all:
        if folds_to_load is None:
            folds_to_load = {
                DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST
            }

        if DataFold.TRAIN in folds_to_load:
            self._loaded_data[DataFold.TRAIN] = self.__load_data(
                path.join("train.jsonl.gz"))
            logger.debug("Done loading training data.")
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(
                path.join("valid.jsonl.gz"))
            logger.debug("Done loading validation data.")
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(
                path.join("test.jsonl.gz"))
            logger.debug("Done loading test data.")
Example #5
    def __load_data(self, data_dir: RichPath,
                    data_fold: DataFold) -> Tuple[List[GraphSample], Dict[str, Any]]:
        if data_fold is None:
            data_fold = DataFold.TRAIN
        if data_fold == DataFold.TRAIN:
            data_name = "train"
        elif data_fold == DataFold.VALIDATION:
            data_name = "valid"
        elif data_fold == DataFold.TEST:
            data_name = "test"
        else:
            raise ValueError("Unknown data fold '%s'" % str(data_fold))

        print(" Loading DEOBFUSCATION %s data from %s." %
              (data_name, data_dir))

        if data_dir.join("%s-saved.pkl.gz" % data_name).is_file():
            read_data = data_dir.join("%s-saved.pkl.gz" %
                                      data_name).read_by_file_suffix()
            return read_data["all_graphs"], read_data["properties"]

        all_untensorised = data_dir.join("%s.pkl.gz" %
                                         data_name).read_by_file_suffix()

        graphs = all_untensorised["graphs"]

        properties = dict()
        properties["all_user_nodes"] = all_untensorised["name_to_id_mapping"]
        properties["user_defined_nodes_number"] = all_untensorised[
            "total_user_defined_nodes"]
        properties["edge_mapping"] = all_untensorised["edge_name_to_id"]
        properties["__num_labels"] = len(properties["all_user_nodes"])
        properties["__num_edge_types"] = len(properties["edge_mapping"])
        properties["__num_types"] = len(all_untensorised["type_to_id"])

        all_graphs = []
        for i in tqdm(range(len(graphs))):
            old_graph = graphs[i]
            if old_graph["user_defined_nodes_number"] > 0:
                all_graphs.append(
                    self.create_graph_sample(
                        old_graph, properties["__num_edge_types"],
                        len(properties["all_user_nodes"])))

        # Debug aid: inspect one sample graph (assumes the dataset holds more than 2500 graphs):
        print_graph_number = 2500
        print([
            all_untensorised["ids_to_names"][x]
            for x in all_graphs[print_graph_number].labels
        ], all_graphs[print_graph_number].nodes_mask)
        print(all_graphs[print_graph_number])
        to_save = dict()
        to_save["all_graphs"] = all_graphs
        to_save["properties"] = properties
        data_dir.join("%s-saved.pkl.gz" %
                      data_name).save_as_compressed_file(to_save)

        print("Saved modified data to %s-saved.pkl.gz" % data_name)

        return all_graphs, properties
Example #6
def df_to_jsonl(df: pd.DataFrame,
                RichPath_obj: RichPath,
                i: int,
                basefilename='codedata') -> str:
    dest_filename = f'{basefilename}_{str(i).zfill(5)}.jsonl.gz'
    RichPath_obj.join(dest_filename).save_as_compressed_file(
        df.to_dict(orient='records'))
    return str(RichPath_obj.join(dest_filename))
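A hedged usage sketch for df_to_jsonl, sharding a DataFrame into fixed-size .jsonl.gz chunks (input file, chunk size, and output directory are illustrative):

import pandas as pd
from dpu_utils.utils import RichPath

df = pd.read_json("code_data.json")          # hypothetical input file
out_dir = RichPath.create("outputs/chunks")
chunk_size = 5000
for i, start in enumerate(range(0, len(df), chunk_size)):
    # Each call writes codedata_{i:05d}.jsonl.gz into out_dir and returns its path:
    written_path = df_to_jsonl(df.iloc[start:start + chunk_size], out_dir, i)
    print("Wrote", written_path)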
Example #7
def run_train(model_class: Type[Model],
              train_data_path: RichPath,
              valid_data_path: RichPath,
              save_folder: str,
              hyperparameters: Dict[str, Any],
              run_name: Optional[str]=None,
              quiet: bool=False) \
        -> RichPath:
    train_data_chunk_paths = train_data_path.get_filtered_files_in_dir('chunk_*')
    valid_data_chunk_paths = valid_data_path.get_filtered_files_in_dir('valid_chunk_*')

    model = model_class(hyperparameters, run_name=run_name, model_save_dir=save_folder, log_save_dir=save_folder)
    if os.path.exists(model.model_save_path):
        model = model_restore_helper.restore(RichPath.create(model.model_save_path), is_train=True)
        model.train_log("Resuming training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(
                                                                                                 hyperparameters)))
        resume = True
    else:
        model.load_existing_metadata(train_data_path.join('metadata.pkl.gz'))
        model.make_model(is_train=True)
        model.train_log("Starting training run %s of model %s with following hypers:\n%s" % (hyperparameters['run_id'],
                                                                                             model.__class__.__name__,
                                                                                             json.dumps(hyperparameters)))
        resume = False
    model_path = model.train(train_data_chunk_paths, valid_data_chunk_paths, quiet=quiet, resume=resume)
    return model_path
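A sketch of how run_train might be invoked; the model class, paths, and hyperparameters below are placeholders, not part of the original API:

from dpu_utils.utils import RichPath

hypers = {"run_id": "demo-run", "batch_size": 200}   # placeholder hyperparameters
best_model_path = run_train(
    model_class=MyModel,                             # hypothetical Model subclass
    train_data_path=RichPath.create("data/train"),
    valid_data_path=RichPath.create("data/valid"),
    save_folder="trained_models",
    hyperparameters=hypers,
    run_name="demo-run",
)
print("Best model saved to", best_model_path)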
Example #8
    def __load_data(self, data_dir: RichPath, data_fold: DataFold):
        if data_fold == DataFold.TRAIN:
            data_name = "train"
        elif data_fold == DataFold.VALIDATION:
            data_name = "valid"
        elif data_fold == DataFold.TEST:
            data_name = "test"
        else:
            raise ValueError("Unknown data fold '%s'" % str(data_fold))
        print(" Loading observer %s data from %s." % (data_name, data_dir))

        graph_pickle_data = data_dir.join("%s_data_collection.pickle" %
                                          data_name).read_by_file_suffix()

        graph_id_to_edges: Dict[int, List[Tuple[int, int]]] = {}
        graph_id_to_features: Dict[int, List[np.ndarray]] = {}
        graph_id_to_targets: Dict[int, List[np.ndarray]] = {}
        #graph_id_to_node_offset: Dict[int, int] = {}

        for graph_id, graph in enumerate(
                graph_pickle_data, 1
        ):  # graph_pickle_data is a list of dicts; graph_id starts at 1.
            graph_id_to_features[graph_id] = []
            graph_id_to_targets[graph_id] = []
            graph_id_to_edges[graph_id] = []
            for node_dict in graph['graph']['nodes']:
                graph_id_to_features[graph_id].append(
                    np.array(list(node_dict.values())))
            for target_dict in graph['actions'].values():
                graph_id_to_targets[graph_id].append(
                    np.array(list(target_dict.values())))
            # Convert source/target nodes from IDs to positions in the node list:
            for edge_dict in graph['graph']['links']:
                src_node, tgt_node = edge_dict['source'], edge_dict['target']
                graph_id_to_edges[graph_id].append((src_node, tgt_node))

        final_graphs = []
        for graph_id in graph_id_to_edges.keys():
            num_nodes = len(graph_id_to_features[graph_id])

            adjacency_lists, type_to_node_to_num_inedges = process_adjacency_lists(
                adjacency_lists=[graph_id_to_edges[graph_id]],
                num_nodes=num_nodes,
                add_self_loop_edges=self.params["add_self_loop_edges"],
                tied_fwd_bkwd_edge_types=self._tied_fwd_bkwd_edge_types,
            )

            final_graphs.append(
                TFAgentsGraphSample(
                    adjacency_lists=adjacency_lists,
                    type_to_node_to_num_inedges=type_to_node_to_num_inedges,
                    node_features=np.array(graph_id_to_features[graph_id]),
                    node_targets=np.array(graph_id_to_targets[graph_id]),
                ))

        return final_graphs
Example #9
    def load_data(self,
                  path: RichPath,
                  folds_to_load: Optional[Set[DataFold]] = None) -> None:

        # If no folds are specified, load train, validation, and test:
        if folds_to_load is None:
            folds_to_load = {
                DataFold.TRAIN, DataFold.VALIDATION, DataFold.TEST
            }

        if DataFold.TRAIN in folds_to_load:
            self._loaded_data[DataFold.TRAIN] = self.__load_data(
                path.join("train_token.json"))
        if DataFold.VALIDATION in folds_to_load:
            self._loaded_data[DataFold.VALIDATION] = self.__load_data(
                path.join("validate_token.json"))
        if DataFold.TEST in folds_to_load:
            self._loaded_data[DataFold.TEST] = self.__load_data(
                path.join("test_token.json"))
Example #10
    def load_metadata(self, path: RichPath) -> None:
        """Load the metadata for a dataset (such as vocabularies, names of properties, ...)
        from a path on disk.

        Note: Implementors needing to act on metadata information before loading any actual data
        should override this method.
        """
        if self.metadata == {}:
            metadata_path = path.join("metadata.pkl.gz")
            if metadata_path.exists():
                logger.info(f"Loading metadata from {metadata_path}")
                self._metadata = metadata_path.read_by_file_suffix()
        else:
            logger.warning("Using metadata passed to constructor, not metadata stored with data.")
Example #11
def split_many_files(input_dir: RichPath, output_dir: RichPath,
                     train_ratio: float, valid_ratio: float, test_ratio: float,
                     test_only_projects: Set[str]) -> None:
    output_paths = {}  # type: Dict[str, RichPath]
    for split_name in ['train', 'valid', 'test', 'test-only']:
        graph_dir_name_for_split_type = input_dir.basename() + '-' + split_name
        graph_dir_for_split_type = output_dir.join(
            graph_dir_name_for_split_type)
        output_paths[split_name] = graph_dir_for_split_type
        graph_dir_for_split_type.make_as_dir()

    pool = Pool()
    pool.starmap(split_file,
                 [(f, output_paths, train_ratio, valid_ratio, test_ratio,
                   test_only_projects)
                  for f in input_dir.get_filtered_files_in_dir('*')])

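split_file itself is not shown in this example; below is a hypothetical per-file worker consistent with the starmap call above, routing records deterministically by hashing their project (the "project" field and the routing scheme are assumptions):

import hashlib
from typing import Dict, Set

from dpu_utils.utils import RichPath


def split_file(input_file: RichPath, output_paths: Dict[str, RichPath],
               train_ratio: float, valid_ratio: float, test_ratio: float,
               test_only_projects: Set[str]) -> None:
    split_to_samples = {"train": [], "valid": [], "test": [], "test-only": []}
    for sample in input_file.read_by_file_suffix():
        project = sample.get("project", "")
        if project in test_only_projects:
            split_to_samples["test-only"].append(sample)
            continue
        # Hash the project name into [0, 1) so the split is stable across runs:
        bucket = int(hashlib.md5(project.encode()).hexdigest(), 16) % 10000 / 10000.0
        if bucket < train_ratio:
            split_to_samples["train"].append(sample)
        elif bucket < train_ratio + valid_ratio:
            split_to_samples["valid"].append(sample)
        else:
            split_to_samples["test"].append(sample)
    for split_name, samples in split_to_samples.items():
        if samples:
            output_paths[split_name].join(input_file.basename()).save_as_compressed_file(samples)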
Example #12
    def __load_data(self, data_dir: RichPath,
                    data_fold: DataFold) -> List[GraphSample]:
        if data_fold == DataFold.TRAIN:
            data_name = "train"
        elif data_fold == DataFold.VALIDATION:
            data_name = "valid"
        elif data_fold == DataFold.TEST:
            data_name = "test"
        else:
            raise ValueError("Unknown data fold '%s'" % str(data_fold))
        print(" Loading PPI %s data from %s." % (data_name, data_dir))

        graph_json_data = data_dir.join("%s_graph.json" %
                                        data_name).read_by_file_suffix()
        node_to_features = data_dir.join("%s_feats.npy" %
                                         data_name).read_by_file_suffix()
        node_to_labels = data_dir.join("%s_labels.npy" %
                                       data_name).read_by_file_suffix()
        node_to_graph_id = data_dir.join("%s_graph_id.npy" %
                                         data_name).read_by_file_suffix()
        self.__initial_node_feature_size = node_to_features.shape[-1]
        self.__num_labels = node_to_labels.shape[-1]

        # We read in all the data in two steps:
        #  (1) Read features, labels and insert self-loop edges (edge type 0).
        #      Implicitly, this gives us the number of nodes per graph.
        #  (2) Read all edges, and shift them so that each graph starts with node 0.

        fwd_edge_type = 0
        self.__num_edge_types = 1
        if self.params['add_self_loop_edges']:
            self_loop_edge_type = self.__num_edge_types
            self.__num_edge_types += 1
        if not self.params['tie_fwd_bkwd_edges']:
            bkwd_edge_type = self.__num_edge_types
            self.__num_edge_types += 1

        graph_id_to_graph_data = {}  # type: Dict[int, GraphSample]
        graph_id_to_node_offset = {}
        num_total_nodes = node_to_features.shape[0]
        for node_id in range(num_total_nodes):
            graph_id = node_to_graph_id[node_id]
            # In case we are entering a new graph, note its ID, so that we can normalise everything to start at 0
            if graph_id not in graph_id_to_graph_data:
                graph_id_to_graph_data[graph_id] = \
                    GraphSample(adjacency_lists=[[] for _ in range(self.__num_edge_types)],
                                type_to_node_to_num_incoming_edges=[[] for _ in range(self.__num_edge_types)],
                                node_features=[],
                                node_labels=[])
                graph_id_to_node_offset[graph_id] = node_id
            cur_graph_data = graph_id_to_graph_data[graph_id]
            cur_graph_data.node_features.append(node_to_features[node_id])
            cur_graph_data.node_labels.append(node_to_labels[node_id])
            shifted_node_id = node_id - graph_id_to_node_offset[graph_id]
            if self.params['add_self_loop_edges']:
                cur_graph_data.adjacency_lists[self_loop_edge_type].append(
                    (shifted_node_id, shifted_node_id))
                cur_graph_data.type_to_node_to_num_incoming_edges[
                    self_loop_edge_type].append(1)

        # Prepare reading of the edges by setting counters to 0:
        for graph_data in graph_id_to_graph_data.values():
            num_graph_nodes = len(graph_data.node_features)
            graph_data.type_to_node_to_num_incoming_edges[
                fwd_edge_type] = np.zeros([num_graph_nodes], np.int32)
            if not self.params['tie_fwd_bkwd_edges']:
                graph_data.type_to_node_to_num_incoming_edges[
                    bkwd_edge_type] = np.zeros([num_graph_nodes], np.int32)

        for edge_info in graph_json_data['links']:
            src_node, tgt_node = edge_info['source'], edge_info['target']
            # First, shift node IDs so that each graph starts at node 0:
            graph_id = node_to_graph_id[src_node]
            graph_node_offset = graph_id_to_node_offset[graph_id]
            src_node, tgt_node = src_node - graph_node_offset, tgt_node - graph_node_offset

            cur_graph_data = graph_id_to_graph_data[graph_id]
            cur_graph_data.adjacency_lists[fwd_edge_type].append(
                (src_node, tgt_node))
            cur_graph_data.type_to_node_to_num_incoming_edges[fwd_edge_type][
                tgt_node] += 1
            if not self.params['tie_fwd_bkwd_edges']:
                cur_graph_data.adjacency_lists[bkwd_edge_type].append(
                    (tgt_node, src_node))
                cur_graph_data.type_to_node_to_num_incoming_edges[
                    bkwd_edge_type][src_node] += 1

        final_graphs = []
        for graph_data in graph_id_to_graph_data.values():
            # numpy-ize:
            adj_lists = []
            for edge_type_idx in range(self.__num_edge_types):
                adj_lists.append(
                    np.array(graph_data.adjacency_lists[edge_type_idx]))
            final_graphs.append(
                GraphSample(adjacency_lists=adj_lists,
                            type_to_node_to_num_incoming_edges=np.array(
                                graph_data.type_to_node_to_num_incoming_edges),
                            node_features=np.array(graph_data.node_features),
                            node_labels=np.array(graph_data.node_labels)))

        return final_graphs
Example #13
 def load_eval_data_from_path(self, path: RichPath) -> Iterable[Any]:
     if path.path == self.default_data_path():
         path = path.join("graphs-test")
     return iter(self.__load_data(path, DataFold.TEST))
Example #14
    def tensorise_data_in_dir(self,
                              input_data_dir: RichPath,
                              output_dir: RichPath,
                              for_test: bool,
                              max_num_files: Optional[int]=None,
                              add_raw_data: bool=False,
                              return_num_original_samples: bool = False) \
            -> Union[List[RichPath], Tuple[List[RichPath], int]]:
        """
        Tensorises data in directory by sample-by-sample, generating "chunk" files of
        lists of tensorised samples that are then consumed in the split_data_into_minibatches
        pipeline to construct minibatches.

        Args:
            input_data_dir: Where to load the raw data from (should come from the extraction pipeline)
            output_dir: Where to store the data to.
            for_test: Flag indicating if the data is to be used for testing (which required additional tensorisation steps)
            max_num_files: Maximal number of files to load data from.
            add_raw_data: Flag indicating that the original data should be added to the tensorised data.
            return_num_original_samples: Flag indicating that the return value should contain the
             number of samples we tried to load, including those that we discarded (e.g., because
             they were too big)

        Return:
            List of paths to the generated chunk files, or tuple of that list and the number
            of samples loaded (iff return_num_original_samples was set)
        """
        data_files = get_data_files_from_directory(input_data_dir,
                                                   max_num_files)
        tensorisation_argument_tuples = []
        chunk_paths = []
        for (partition_idx, raw_graph_file_partition) in enumerate(
                partition_files_by_size(data_files, 40 * 1024 * 1024)):
            target_file = output_dir.join("chunk_%04i.pkl.gz" %
                                          (partition_idx, ))
            tensorisation_argument_tuples.append(
                (raw_graph_file_partition, target_file))
            chunk_paths.append(target_file)

        parsing_result_data = {"num_all_samples": 0, "num_used_samples": 0}
        data_file_parser_fn = make_data_file_parser(type(self),
                                                    self.hyperparameters,
                                                    self.metadata,
                                                    for_test=for_test,
                                                    add_raw_data=add_raw_data)

        def received_result_callback(result):
            (num_all_samples, num_used_samples) = result
            parsing_result_data['num_all_samples'] += num_all_samples
            parsing_result_data['num_used_samples'] += num_used_samples

        def finished_callback():
            pass

        run_jobs_in_parallel(tensorisation_argument_tuples,
                             data_file_parser_fn, received_result_callback,
                             finished_callback)

        # Store the metadata we used for this as well, so that we can re-use the results:
        metadata_path = output_dir.join("metadata.pkl.gz")
        metadata_path.save_as_compressed_file({
            "hyperparameters": self.hyperparameters,
            "metadata": self.__metadata,
            "num_used_samples": parsing_result_data['num_used_samples'],
            "num_all_samples": parsing_result_data['num_all_samples'],
        })

        self.train_log(
            "Tensorised %i (%i before filtering) samples from '%s' into '%s'."
            % (parsing_result_data['num_used_samples'],
               parsing_result_data['num_all_samples'], input_data_dir,
               output_dir))

        if return_num_original_samples:
            return chunk_paths, parsing_result_data['num_all_samples']
        return chunk_paths
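Given the signature above, a call could look like this (paths are illustrative):

from dpu_utils.utils import RichPath

chunk_paths, num_raw_samples = model.tensorise_data_in_dir(
    input_data_dir=RichPath.create("data/extracted/train"),
    output_dir=RichPath.create("data/tensorised/train"),
    for_test=False,
    return_num_original_samples=True,
)
model.train_log("Tensorised %i chunk files from %i raw samples." % (len(chunk_paths), num_raw_samples))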
Example #15
 def load_eval_data_from_path(self, path: RichPath) -> Iterable[Any]:
     if path.path == self.default_data_path():
         path = path.join("test.jsonl.gz")
     return self.__load_data(path)
Example #16
 def load_data(self, path: RichPath) -> None:
     self._loaded_data[DataFold.TRAIN] = self.__load_data(
         path.join("train.jsonl.gz"))
     self._loaded_data[DataFold.VALIDATION] = self.__load_data(
         path.join("valid.jsonl.gz"))
Example #17
    def __load_data(self, data_dir: RichPath,
                    data_fold: DataFold) -> List[PPIGraphSample]:
        """
        ppi数据集包括4类文件,分别是
        XXX_feats.npy
        XXX_graph.json
        XXX_graph_id.npy
        XXX_labels.npy
        read_by_file_suffix() : 读取 npy、json、pkl、jsonl等类型的文件
        """
        if data_fold == DataFold.TRAIN:
            data_name = "train"
        elif data_fold == DataFold.VALIDATION:
            data_name = "valid"
        elif data_fold == DataFold.TEST:
            data_name = "test"
        else:
            raise ValueError("Unknown data fold '%s'" % str(data_fold))
        print(" Loading PPI %s data from %s." % (data_name, data_dir))

        graph_json_data = data_dir.join("%s_graph.json" %
                                        data_name).read_by_file_suffix()
        node_to_features = data_dir.join("%s_feats.npy" %
                                         data_name).read_by_file_suffix()
        node_to_labels = data_dir.join("%s_labels.npy" %
                                       data_name).read_by_file_suffix()
        node_to_graph_id = data_dir.join("%s_graph_id.npy" %
                                         data_name).read_by_file_suffix()

        # We read in all the data in two steps:
        #  (1) Read features and labels. Implicitly, this gives us the number of nodes per graph.
        #      (Each node has 50-dimensional features and 121 labels.)
        #  (2) Read all edges, and shift them so that each graph starts with node 0.

        # Collect each graph's edges, features, labels, and node offset:
        graph_id_to_edges: Dict[int, List[Tuple[int, int]]] = {}
        graph_id_to_features: Dict[int, List[np.ndarray]] = {}
        graph_id_to_labels: Dict[int, List[np.ndarray]] = {}
        graph_id_to_node_offset: Dict[int, int] = {}

        num_total_nodes = node_to_features.shape[0]
        for node_id in range(num_total_nodes):
            graph_id = node_to_graph_id[node_id]

            # In case we are entering a new graph, note its ID, so that we can normalise everything to start at 0
            if graph_id not in graph_id_to_edges:
                graph_id_to_edges[graph_id] = []
                graph_id_to_features[graph_id] = []
                graph_id_to_labels[graph_id] = []
                graph_id_to_node_offset[graph_id] = node_id

            graph_id_to_features[graph_id].append(node_to_features[node_id])
            graph_id_to_labels[graph_id].append(node_to_labels[node_id])

        for edge_info in graph_json_data["links"]:
            src_node, tgt_node = edge_info["source"], edge_info["target"]
            # First, shift node IDs so that each graph starts at node 0:
            graph_id = node_to_graph_id[src_node]
            graph_node_offset = graph_id_to_node_offset[graph_id]
            src_node, tgt_node = src_node - graph_node_offset, tgt_node - graph_node_offset

            graph_id_to_edges[graph_id].append((src_node, tgt_node))

        final_graphs = []
        for graph_id in graph_id_to_edges.keys():
            num_nodes = len(graph_id_to_features[graph_id])

            adjacency_lists, type_to_node_to_num_inedges = process_adjacency_lists(
                adjacency_lists=[graph_id_to_edges[graph_id]],
                num_nodes=num_nodes,
                add_self_loop_edges=self.params["add_self_loop_edges"],
                tied_fwd_bkwd_edge_types=self._tied_fwd_bkwd_edge_types,
            )

            final_graphs.append(
                PPIGraphSample(
                    adjacency_lists=adjacency_lists,
                    type_to_node_to_num_inedges=type_to_node_to_num_inedges,
                    node_features=np.array(graph_id_to_features[graph_id]),
                    node_labels=np.array(graph_id_to_labels[graph_id]),
                ))

        return final_graphs
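As the docstring above notes, read_by_file_suffix dispatches on the file extension; a brief illustration (file names are placeholders):

from dpu_utils.utils import RichPath

data_dir = RichPath.create("data/ppi")
feats = data_dir.join("train_feats.npy").read_by_file_suffix()     # numpy array
graph = data_dir.join("train_graph.json").read_by_file_suffix()    # parsed JSON dict
meta = data_dir.join("metadata.pkl.gz").read_by_file_suffix()      # unpickled object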
Example #18
    @classmethod
    def load_data(cls, data_dir: RichPath,
                  data_fold: str) -> List[PPIGraphSample]:

        print(" Loading PPI %s data from %s." % (data_fold, data_dir))

        graph_json_data = data_dir.join("%s_graph.json" %
                                        data_fold).read_by_file_suffix()
        node_to_features = data_dir.join("%s_feats.npy" %
                                         data_fold).read_by_file_suffix()
        node_to_labels = data_dir.join("%s_labels.npy" %
                                       data_fold).read_by_file_suffix()
        node_to_graph_id = data_dir.join("%s_graph_id.npy" %
                                         data_fold).read_by_file_suffix()

        # We read in all the data in two steps:
        #  (1) Read features, labels. Implicitly, this gives us the number of nodes per graph.
        #  (2) Read all edges, and shift them so that each graph starts with node 0.
        fwd_edge_type = 0

        graph_id_to_graph_data: Dict[int, PPIGraphSample] = {}
        graph_id_to_node_offset: Dict[int, int] = {}
        num_total_nodes = node_to_features.shape[0]
        for node_id in range(num_total_nodes):
            graph_id = node_to_graph_id[node_id]
            # In case we are entering a new graph, note its ID, so that we can normalise everything to start at 0
            if graph_id not in graph_id_to_graph_data:
                graph_id_to_graph_data[graph_id] = PPIGraphSample(
                    adjacency_lists=[[]],
                    node_features=[],
                    node_labels=[],
                )
                graph_id_to_node_offset[graph_id] = node_id
            cur_graph_data = graph_id_to_graph_data[graph_id]
            cur_graph_data.node_features.append(node_to_features[node_id])
            cur_graph_data.node_labels.append(node_to_labels[node_id])

        for edge_info in graph_json_data["links"]:
            src_node, tgt_node = edge_info["source"], edge_info["target"]
            # First, shift node IDs so that each graph starts at node 0:
            graph_id = node_to_graph_id[src_node]
            graph_node_offset = graph_id_to_node_offset[graph_id]
            src_node, tgt_node = src_node - graph_node_offset, tgt_node - graph_node_offset

            cur_graph_data = graph_id_to_graph_data[graph_id]
            cur_graph_data.adjacency_lists[fwd_edge_type].append(
                (src_node, tgt_node))

        final_graphs = []
        for graph_data in graph_id_to_graph_data.values():
            # numpy-ize:
            adj_lists = [
                np.array(graph_data.adjacency_lists[fwd_edge_type],
                         dtype=np.int32)
            ]
            final_graphs.append(
                PPIGraphSample(
                    adjacency_lists=adj_lists,
                    node_features=np.array(graph_data.node_features,
                                           dtype=np.float32),
                    node_labels=np.array(graph_data.node_labels,
                                         dtype=bool),
                ))

        return final_graphs
Example #19
def _write_data(out_dir: RichPath, window_idx: int, chunk_size: int, data_window: List[Any]):
    np.random.shuffle(data_window)
    for chunk_idx, data_chunk in enumerate(chunked(data_window, chunk_size)):
        out_file = out_dir.join('chunk_%i-%i.jsonl.gz' % (window_idx, chunk_idx))
        out_file.save_as_compressed_file(data_chunk)
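chunked above is assumed to behave like more_itertools.chunked; a minimal stand-in and an illustrative call ('samples' is a placeholder list of datapoints):

from typing import Any, Iterable, Iterator, List

from dpu_utils.utils import RichPath


def chunked(data: Iterable[Any], chunk_size: int) -> Iterator[List[Any]]:
    # Yield successive lists of at most chunk_size elements.
    chunk: List[Any] = []
    for element in data:
        chunk.append(element)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


_write_data(RichPath.create("outputs"), window_idx=0, chunk_size=1000, data_window=samples)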