Пример #1
0
    def load_graph(self) -> Graph:
        """
        Build the Ensmallen graph described by ``self.main_graph_args()``.

        The ``node_path`` and ``edge_path`` entries of the argument dict are
        resolved first: an entry recognised by ``is_url`` is downloaded into
        ``self.outdir()`` under a file name derived from the URL (every
        character not in ``VALID_CHARS`` becomes ``_``) and the entry is
        rewritten to point at the downloaded copy. Any other entry must pass
        ``is_valid_path``.

        :return: ensmallen Graph built by ``Graph.from_csv``
        :raises FileNotFoundError: if a non-URL path fails ``is_valid_path``
        """
        from_csv_kwargs = self.main_graph_args()

        for path_key in ['node_path', 'edge_path']:
            location = from_csv_kwargs[path_key]

            if not is_url(location):
                # Local file: nothing to fetch, it only has to exist.
                if not is_valid_path(location):
                    raise FileNotFoundError(f"Please check path: {location}")
                continue

            # Remote file: turn the URL into a filesystem-safe name and
            # download it into the output directory.
            safe_name = ''.join(
                char if char in VALID_CHARS else "_" for char in location)
            local_copy = os.path.join(self.outdir(), safe_name)
            download_file(location, local_copy)
            from_csv_kwargs[path_key] = local_copy

        # Every path is now local, so the graph can be loaded.
        return Graph.from_csv(**from_csv_kwargs)
Пример #2
0
    def make_link_prediction_data(self, embedding_file: str,
                                  training_graph_args: dict,
                                  pos_validation_args: dict,
                                  neg_training_args: dict,
                                  neg_validation_args: dict,
                                  edge_method: str) -> Tuple[Tuple, Tuple]:
        """Prepare training and validation data for link prediction classifiers

        The positive training graph is loaded from ``training_graph_args``
        as given. Each of the other three graphs is loaded from a deep copy
        of ``training_graph_args`` updated with its own arguments, so those
        dicts only need to hold the entries that differ.

        Args:
            embedding_file: path to embedding file for nodes in graph
                (read with the first column as index and no header row)
            training_graph_args: arguments to load the positive training graph
            pos_validation_args: overrides to load the positive validation graph
            neg_training_args: overrides to load the negative training graph
            neg_validation_args: overrides to load the negative validation graph
            edge_method: edge embedding method to use (average, L1, L2, etc)
        Returns:
            ``((train_edges, train_labels), (valid_edges, valid_labels))``
            as produced by ``LinkPredictionTransformer.transform``

        """
        node_vectors = pd.read_csv(embedding_file, index_col=0, header=None)

        # Load the four graphs; all but the first are expressed as
        # overrides on top of the training graph arguments.
        overrides_by_name = {
            'pos_validation': pos_validation_args,
            'neg_training': neg_training_args,
            'neg_validation': neg_validation_args,
        }
        loaded = {'pos_training': Graph.from_csv(**training_graph_args)}
        for label, overrides in overrides_by_name.items():
            merged_args = copy.deepcopy(training_graph_args)
            merged_args.update(overrides)
            loaded[label] = Graph.from_csv(**merged_args)

        # The transformer turns node embeddings into edge embeddings.
        transformer = LinkPredictionTransformer(method=edge_method)
        transformer.fit(node_vectors)

        training_pair = transformer.transform(
            positive_graph=loaded['pos_training'],
            negative_graph=loaded['neg_training'])
        train_edges, train_labels = training_pair

        validation_pair = transformer.transform(
            positive_graph=loaded['pos_validation'],
            negative_graph=loaded['neg_validation'])
        valid_edges, valid_labels = validation_pair

        return (train_edges, train_labels), (valid_edges, valid_labels)
Пример #3
0
    def test_make_tsne(self):
        """make_tsne should produce the expected t-SNE output file."""
        yhelp = YamlHelper(
            "tests/resources/test_graph_embedding_bert_tsne.yaml")

        # Node and edge files live under the input directory from the YAML.
        node_file = os.path.join(
            yhelp.yaml['input_directory'],
            yhelp.yaml['graph_data']['graph']['node_path'])
        edge_file = os.path.join(
            yhelp.yaml['input_directory'],
            yhelp.yaml['graph_data']['graph']['edge_path'])

        graph_kwargs = {
            "nodes_column": "id",
            "node_list_node_types_column": "category",
            "default_node_type": "biolink:NamedThing",
            "node_path": node_file,
            "edge_path": edge_file,
            "sources_column": "subject",
            "destinations_column": "object",
            "directed": False,
        }
        test_graph = Graph.from_csv(**graph_kwargs)

        # Use the embeddings shipped with the test resources instead of
        # whatever embedding file the YAML helper would supply.
        tsne_kwargs = yhelp.make_tsne_args(graph=test_graph)
        tsne_kwargs['embedding_file'] = 'tests/resources/test_embeddings.tsv'
        make_tsne(**tsne_kwargs)

        tsne_file_written = os.path.exists(self.expected_tsne_file)
        self.assertTrue(tsne_file_written)
Пример #4
0
def make_node_embeddings(
        embedding_outfile: str,
        embedding_history_outfile: str,
        main_graph_args: dict,
        node_embedding_params: dict,
        bert_columns: dict,
        bert_pretrained_model: str = "allenai/scibert_scivocab_uncased"
) -> None:
    """Compute node embeddings and write them, plus training history, to disk

    Graph embeddings come from ``compute_node_embedding``. When
    ``bert_columns`` is non-empty, each node also gets a text vector: the
    named columns of the node file are joined with spaces, tokenized, and the
    pretrained word-embedding rows of the resulting tokens are averaged. Only
    the model's word-embedding table is used; the model is never run on the
    text. The text vectors are concatenated column-wise to the graph
    embeddings.

    Args:
        embedding_outfile: outfile to write out embeddings (CSV, no header)
        embedding_history_outfile: outfile to write out training history as
                JSON; only written when the history is not empty
        main_graph_args: arguments passed to Graph.from_csv for graph loading;
                'node_path' is also read when bert_columns is given
        node_embedding_params: keyword args passed to compute_node_embedding()
        bert_columns: node file columns whose text is embedded with the
                pretrained Bert word embeddings
        bert_pretrained_model: name of the pretrained model handed to
                BertModel / BertTokenizer ``from_pretrained``
    Returns:
        None.

    """
    # Graph-structure embeddings are computed first.
    graph: Graph = Graph.from_csv(**main_graph_args)
    node_embedding, training_history = compute_node_embedding(
        graph, **node_embedding_params)

    # Text embeddings stay an empty frame unless columns were requested.
    text_features = pd.DataFrame()
    if bert_columns:
        model = BertModel.from_pretrained(bert_pretrained_model,
                                          output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model)
        model.eval()
        vocabulary_vectors = model.embeddings.word_embeddings.weight.data.numpy()

        node_table = get_node_data(main_graph_args['node_path'])

        # One space-joined string per node row.
        texts = []
        for _, record in tqdm(node_table.iterrows(),
                              "extracting text from nodes"):
            texts.append(
                " ".join(str(record[column]) for column in bert_columns))

        # Token ids for every node text, as numpy arrays.
        token_ids = []
        for text in tqdm(texts, "tokenzing text"):
            token_ids.append(tokenizer.encode(text, return_tensors='np'))

        # Average the word-embedding rows of each text's tokens.
        pooled_vectors = []
        for ids in tqdm(token_ids, "extracting embeddings for tokens"):
            pooled_vectors.append(
                np.mean(vocabulary_vectors[ids.flatten()], axis=0))

        # NOTE(review): this assumes the node file rows are in the same order
        # as graph.get_node_names() - confirm against the loader.
        text_features = pd.DataFrame(pooled_vectors,
                                     index=graph.get_node_names())

    if not text_features.empty:
        node_embedding = pd.concat([node_embedding, text_features],
                                   axis=1,
                                   ignore_index=False)

    if not training_history.empty:
        with open(embedding_history_outfile, 'w') as history_file:
            history_file.write(training_history.to_json())

    node_embedding.to_csv(embedding_outfile, header=False)
    return None
    def __call__(self) -> Graph:
        """Return Graph containing required graph.

        Steps performed:

        1. When caching is disabled, remove any existing preprocessed
           directory, then call ``self.download()``.
        2. Run the configured callbacks, joining every ``*_path`` argument
           with ``self._cache_path``.
        3. When preprocessing is enabled, build the optimal node/edge list
           files (unless already preprocessed), store their metadata and
           load the graph from those files. Otherwise load the graph
           directly from the graph arguments.
        4. Call ``graph.enable()`` when automatic tradeoffs are requested
           and the graph has fewer than 50e6 unique edges.

        Raises
        ------
        RuntimeError
            If building the optimal lists fails, or if loading the graph
            from the preprocessed files fails. The original exception is
            chained as the cause.
        """
        graph_arguments = self.get_graph_arguments()
        root = self.get_preprocessed_graph_directory_path()

        # Without caching, start from a clean preprocessed directory.
        if not self._cache and os.path.exists(root):
            shutil.rmtree(root)

        self.download()
        os.makedirs(root, exist_ok=True)

        # Call the provided callbacks to process the edge lists, if any.
        # Arguments whose name ends in "_path" are relative to the cache.
        for callback, arguments in zip(self._callbacks, self._callbacks_arguments):
            callback(**{
                key: os.path.join(self._cache_path, value)
                if key.endswith("_path") else value
                for key, value in arguments.items()
            })

        # Preprocess the edge list to an optimal edge list
        # if this is enabled.
        if self._preprocess:
            # If any of the node types columns have been provided,
            # we compute the target node types column
            target_node_type_list_path = None
            if any(
                graph_arguments.get(column) is not None
                for column in (
                    "node_list_node_types_column_number",
                    "node_list_node_types_column",
                )
            ):
                target_node_type_list_path = self.get_preprocessed_graph_node_types_path()

            # If any of the edge types columns have been provided,
            # we compute the target edge types column
            target_edge_type_list_path = None
            if any(
                graph_arguments.get(column) is not None
                for column in (
                    "edge_list_edge_types_column_number",
                    "edge_list_edge_types_column",
                )
            ):
                target_edge_type_list_path = self.get_preprocessed_graph_edge_types_path()

            target_node_path = self.get_preprocessed_graph_nodes_path()
            target_edge_path = self.get_preprocessed_graph_edges_path()

            # Path of the original node list; the check below treats None
            # as "no node list was specified".
            node_path = self.get_adjusted_graph_nodes_path()

            # Singletons are only considered possible when a node list exists.
            may_have_singletons = graph_arguments.get(
                "may_have_singletons", True
            ) and node_path is not None

            if not self.is_preprocessed():
                try:
                    (
                        node_types_number,
                        nodes_number,
                        edge_types_number,
                        edges_number
                    ) = edge_list_utils.build_optimal_lists_files(
                        # NOTE: the following parameters are supported by the parser, but
                        # so far we have not encountered a single use case where we actually used them.
                        # original_node_type_path,
                        # original_node_type_list_separator,
                        # original_node_types_column_number,
                        # original_node_types_column,
                        # original_numeric_node_type_ids,
                        # original_minimum_node_type_id,
                        # original_node_type_list_header,
                        # original_node_type_list_support_balanced_quotes,
                        # original_node_type_list_rows_to_skip,
                        # original_node_type_list_max_rows_number,
                        # original_node_type_list_comment_symbol,
                        # original_load_node_type_list_in_parallel,
                        # original_node_type_list_is_correct,
                        # node_types_number,
                        target_node_type_list_path=target_node_type_list_path,
                        target_node_type_list_separator='\t',
                        target_node_type_list_node_types_column_number=0,
                        original_node_path=node_path,
                        original_node_list_header=graph_arguments.get(
                            "node_list_header"
                        ),
                        original_node_list_support_balanced_quotes=graph_arguments.get(
                            "node_list_support_balanced_quotes"
                        ),
                        node_list_rows_to_skip=graph_arguments.get(
                            "node_list_rows_to_skip"
                        ),
                        node_list_is_correct=graph_arguments.get(
                            "node_list_is_correct"
                        ),
                        node_list_max_rows_number=graph_arguments.get(
                            "node_list_max_rows_number"
                        ),
                        node_list_comment_symbol=graph_arguments.get(
                            "node_list_comment_symbol"
                        ),
                        default_node_type=graph_arguments.get(
                            "default_node_type"
                        ),
                        original_nodes_column_number=graph_arguments.get(
                            "nodes_column_number"
                        ),
                        original_nodes_column=graph_arguments.get(
                            "nodes_column"
                        ),
                        original_node_types_separator=graph_arguments.get(
                            "node_types_separator"
                        ),
                        original_node_list_separator=graph_arguments.get(
                            "node_list_separator"
                        ),
                        original_node_list_node_types_column_number=graph_arguments.get(
                            "node_list_node_types_column_number"
                        ),
                        original_node_list_node_types_column=graph_arguments.get(
                            "node_list_node_types_column"
                        ),
                        nodes_number=graph_arguments.get("nodes_number"),
                        # original_minimum_node_id,
                        # original_numeric_node_ids,
                        # original_node_list_numeric_node_type_ids,
                        original_skip_node_types_if_unavailable=True,
                        # It make sense to load the node list in parallel only when
                        # you have to preprocess the node types, since otherwise the nodes number
                        # would be unknown.
                        original_load_node_list_in_parallel=target_node_type_list_path is not None,
                        maximum_node_id=graph_arguments.get(
                            "maximum_node_id"
                        ),
                        target_node_path=target_node_path,
                        target_node_list_separator='\t',
                        target_nodes_column=graph_arguments.get(
                            "nodes_column"
                        ),
                        target_nodes_column_number=0,
                        target_node_list_node_types_column_number=1,
                        target_node_types_separator="|",
                        # original_edge_type_path,
                        # original_edge_type_list_separator,
                        # original_edge_types_column_number,
                        # original_edge_types_column,
                        # original_numeric_edge_type_ids,
                        # original_minimum_edge_type_id,
                        # original_edge_type_list_header,
                        # edge_type_list_rows_to_skip,
                        # edge_type_list_max_rows_number,
                        # edge_type_list_comment_symbol,
                        # load_edge_type_list_in_parallel=True,
                        # edge_type_list_is_correct,
                        # edge_types_number,
                        target_edge_type_list_path=target_edge_type_list_path,
                        target_edge_type_list_separator='\t',
                        target_edge_type_list_edge_types_column_number=0,
                        original_edge_path=os.path.join(
                            self._cache_path, graph_arguments["edge_path"]),
                        original_edge_list_header=graph_arguments.get(
                            "edge_list_header"
                        ),
                        original_edge_list_support_balanced_quotes=graph_arguments.get(
                            "edge_list_support_balanced_quotes"
                        ),
                        original_edge_list_separator=graph_arguments.get(
                            "edge_list_separator"
                        ),
                        original_sources_column_number=graph_arguments.get(
                            "sources_column_number"
                        ),
                        original_sources_column=graph_arguments.get(
                            "sources_column"
                        ),
                        original_destinations_column_number=graph_arguments.get(
                            "destinations_column_number"
                        ),
                        original_destinations_column=graph_arguments.get(
                            "destinations_column"
                        ),
                        original_edge_list_edge_types_column_number=graph_arguments.get(
                            "edge_list_edge_types_column_number"
                        ),
                        original_edge_list_edge_types_column=graph_arguments.get(
                            "edge_list_edge_types_column"
                        ),
                        default_edge_type=graph_arguments.get(
                            "default_edge_type"
                        ),
                        original_weights_column_number=graph_arguments.get(
                            "weights_column_number"
                        ),
                        original_weights_column=graph_arguments.get(
                            "weights_column"
                        ),
                        default_weight=graph_arguments.get(
                            "default_weight"
                        ),
                        original_edge_list_numeric_node_ids=graph_arguments.get(
                            "edge_list_numeric_node_ids"
                        ),
                        skip_weights_if_unavailable=graph_arguments.get(
                            "skip_weights_if_unavailable"
                        ),
                        skip_edge_types_if_unavailable=graph_arguments.get(
                            "skip_edge_types_if_unavailable"
                        ),
                        edge_list_comment_symbol=graph_arguments.get(
                            "edge_list_comment_symbol"
                        ),
                        edge_list_max_rows_number=graph_arguments.get(
                            "edge_list_max_rows_number"
                        ),
                        edge_list_rows_to_skip=graph_arguments.get(
                            "edge_list_rows_to_skip"
                        ),
                        load_edge_list_in_parallel=True,
                        edges_number=graph_arguments.get("edges_number"),
                        target_edge_path=target_edge_path,
                        target_edge_list_separator='\t',
                        sort_temporary_directory=self._sort_tmp_dir,
                        directed=self._directed,
                        verbose=self._verbose > 0,
                        name=self._name,
                    )
                except Exception as e:
                    # Adjacent literals are concatenated verbatim, so every
                    # fragment must carry its own trailing space.
                    raise RuntimeError(
                        f"Something went wrong while preprocessing the graph {self._name}, "
                        f"version {self._version}, "
                        f"retrieved from the {self._repository} repository. "
                        "This is NOT the loading step, but a preprocessing step "
                        "that loads remote data from third parties. "
                        "As such there may have been some changes in the remote data "
                        "that may have made them incompatible with the current "
                        "expected parametrization. "
                        "Do open up an issue in the Ensmallen's GitHub repository reporting also the complete "
                        "exception of this error to help us keep the automatic graph retrieval "
                        "in good shape. Thank you!"
                    ) from e
                # Store the obtained metadata
                self.store_preprocessed_metadata(
                    node_types_number,
                    nodes_number,
                    edge_types_number,
                    edges_number
                )
            # Load the stored metadata
            metadata = self.get_preprocessed_metadata()
            # Node types are loaded only when the metadata reports them
            # and the caller asked for them.
            has_node_types = metadata["node_types_number"] is not None
            if has_node_types and self._load_node_types:
                node_types_arguments = {
                    "node_type_path": target_node_type_list_path,
                    "node_types_column_number": 0,
                    "node_type_list_is_correct": True,
                    "node_type_list_separator": "\t",
                    "node_types_separator": "|",
                    "node_list_node_types_column_number": 1,
                    "node_list_numeric_node_type_ids": True,
                    "skip_node_types_if_unavailable": True,
                }
            else:
                node_types_arguments = {}
            # If the nodes are to be loaded
            if self._load_nodes:
                nodes_arguments = {
                    "node_path": target_node_path,
                    "node_list_separator": "\t",
                    "nodes_column_number": 0,
                    "node_list_is_correct": True,
                    **node_types_arguments
                }
            else:
                # No node list is passed: request numeric node IDs instead.
                nodes_arguments = {
                    "numeric_node_ids": True,
                }

            # If the edge types are provided
            has_edge_types = metadata["edge_types_number"] is not None
            if has_edge_types:
                edge_types_arguments = {
                    "edge_type_path": target_edge_type_list_path,
                    "edge_types_column_number": 0,
                    "edge_type_list_is_correct": True,
                    "edge_type_list_separator": "\t",
                    "edge_list_edge_types_column_number": 2,
                    "edge_list_numeric_edge_type_ids": True,
                    "skip_edge_types_if_unavailable": True,
                }
            else:
                edge_types_arguments = {}

            # Weights are considered present when any weight-related key
            # appears in the graph arguments.
            has_edge_weights = any(
                column in graph_arguments
                for column in (
                    "weights_column_number",
                    "weights_column",
                    "default_weight"
                )
            )
            if has_edge_weights and self._load_edge_weights:
                edge_weights_arguments = {
                    # Column 2 when there are no edge types, column 3 when
                    # the edge type column occupies position 2.
                    "weights_column_number": 2 + int(metadata["edge_types_number"] is not None),
                    "skip_weights_if_unavailable": True,
                }
            else:
                edge_weights_arguments = {}

            try:
                # Load the graph; the explicit keys below take precedence
                # over anything coming from the unpacked dictionaries.
                graph = Graph.from_csv(**{
                    **metadata,
                    **nodes_arguments,
                    **edge_types_arguments,
                    **edge_weights_arguments,

                    "edge_path": target_edge_path,
                    "edge_list_header": False,
                    "sources_column_number": 0,
                    "destinations_column_number": 1,
                    "edge_list_numeric_node_ids": True,
                    "edge_list_is_complete": True,
                    "edge_list_may_contain_duplicates": False,
                    "edge_list_is_sorted": True,
                    "edge_list_is_correct": True,
                    "edges_number": metadata["edges_number"],
                    "nodes_number": metadata["nodes_number"],
                    "may_have_singletons": may_have_singletons,
                    "verbose": self._verbose > 0,
                    "directed": self._directed,
                    "name": self._name,
                })
            except Exception as e:
                raise RuntimeError(
                    f"Something went wrong while loading the graph {self._name}, "
                    f"version {self._version}, "
                    f"retrieved from the {self._repository} repository. "
                    "Do note that the preprocessing step of the graph has "
                    "completed without apparent errors. "
                    "This is likely something wrong with the Ensmallen library "
                    "so do please open an issue about the error you have encountered "
                    "in the Ensmallen's GitHub repository reporting also the complete "
                    "exception of this error. Thank you!"
                ) from e
        else:
            # Otherwise just load the graph, resolving every "*_path"
            # argument relative to the cache directory.
            graph = Graph.from_csv(**{
                **{
                    key: os.path.join(self._cache_path, value)
                    if key.endswith("_path") else value
                    for key, value in graph_arguments.items()
                },
                "directed": self._directed,
                "verbose": self._verbose > 0,
                "name": self._name,
                **self._graph_kwargs,
            })
        # Tradeoffs are only enabled automatically below 50e6 unique edges.
        if self._auto_enable_tradeoffs and graph.get_number_of_unique_edges() < 50e6:
            graph.enable()
        return graph
Пример #6
0
    def __call__(self) -> Graph:
        """Return Graph containing required graph.

        When caching is disabled, any existing preprocessed directory is
        removed. If the preprocessed directory does not exist, the source
        data is downloaded. Unless already preprocessed, the source is then
        parsed with ``edge_list_utils.parse_wikipedia_graph`` into node,
        edge, node type and edge type lists and the resulting counts are
        stored as metadata. The graph is finally loaded from those lists,
        and ``graph.enable()`` is called when automatic tradeoffs are
        requested and the graph has fewer than 50e6 unique edges.
        """
        graph_arguments = self.get_graph_arguments()
        preprocessed_dir = self.get_preprocessed_graph_directory_path()

        # Without caching, discard whatever was preprocessed before.
        if not self._cache:
            if os.path.exists(preprocessed_dir):
                shutil.rmtree(preprocessed_dir)

        source_paths = self.get_adjusted_graph_paths()
        preprocessed_dir_missing = not os.path.exists(preprocessed_dir)
        if preprocessed_dir_missing:
            # Download the necessary data
            self._downloader.download(self._graph["urls"], source_paths)

        os.makedirs(preprocessed_dir, exist_ok=True)

        node_types_file = self.get_preprocessed_graph_node_types_path()
        edge_types_file = self.get_preprocessed_graph_edge_types_path()
        nodes_file = self.get_preprocessed_graph_nodes_path()
        edges_file = self.get_preprocessed_graph_edges_path()

        if not self.is_preprocessed():
            parsed_counts = edge_list_utils.parse_wikipedia_graph(
                # The ".bz2" suffix is stripped from the first source path.
                source_path=source_paths[0].replace(".bz2", ""),
                edge_path=edges_file,
                node_path=nodes_file,
                node_type_path=node_types_file,
                edge_type_path=edge_types_file,
                node_list_separator="\t",
                node_type_list_separator="\t",
                edge_type_list_separator="\t",
                node_types_separator="|",
                nodes_column="node_names",
                node_types_column="node_type_names",
                node_list_node_types_column="node_type_names",
                edge_types_column="edge_type_names",
                node_descriptions_column="node_descriptions",
                edge_list_separator="\t",
                keep_nodes_without_descriptions=self._keep_nodes_without_descriptions,
                keep_nodes_without_categories=self._keep_nodes_without_categories,
                keep_interwikipedia_nodes=self._keep_interwikipedia_nodes,
                keep_external_nodes=self._keep_external_nodes,
                compute_node_description=self._compute_node_description,
                sort_temporary_directory=self._sort_tmp_dir,
                directed=self._directed,
                verbose=self._verbose > 0,
            )
            node_types_count, nodes_count, edges_count = parsed_counts
            # Store the obtained metadata; the parser returns no edge
            # types count, so None is stored in that position.
            self.store_preprocessed_metadata(
                node_types_count,
                nodes_count,
                None,
                edges_count
            )

        # Load the stored metadata
        metadata = self.get_preprocessed_metadata()

        # Node type arguments are only built when node types are requested.
        node_type_kwargs = {}
        if self._load_node_types:
            node_type_kwargs.update({
                "node_type_path": node_types_file,
                "node_types_number": metadata["node_types_number"],
                "node_types_column": "node_type_names",
                "node_type_list_is_correct": True,
                "node_type_list_separator": "\t",
                "node_types_separator": "|",
                "node_list_node_types_column_number": 1,
                "node_list_numeric_node_type_ids": True,
            })

        # Either load the node list (with its optional node types) or
        # request numeric node IDs instead.
        if not self._load_nodes:
            node_kwargs = {"numeric_node_ids": True}
        else:
            node_kwargs = {
                "node_path": nodes_file,
                "node_list_separator": "\t",
                "nodes_column": "node_names",
                "node_list_is_correct": True,
            }
            node_kwargs.update(node_type_kwargs)

        # Edge type arguments are always passed.
        edge_type_kwargs = {
            "edge_type_path": edge_types_file,
            "edge_types_number": metadata["edge_types_number"],
            "edge_types_column_number": 0,
            "edge_type_list_is_correct": True,
            "edge_type_list_separator": "\t",
            "edge_list_edge_types_column_number": 2,
            "edge_list_numeric_edge_type_ids": True
        }

        # Later layers override earlier ones; the fixed edge list
        # settings come last and therefore always win.
        loader_kwargs = {}
        for layer in (metadata, graph_arguments, node_kwargs, edge_type_kwargs):
            loader_kwargs.update(layer)
        loader_kwargs.update({
            "edge_path": edges_file,
            "edge_list_header": False,
            "sources_column_number": 0,
            "destinations_column_number": 1,
            "edge_list_numeric_node_ids": True,
            "edge_list_is_complete": True,
            "edge_list_may_contain_duplicates": False,
            "edge_list_is_sorted": True,
            "edge_list_is_correct": True,
            "edges_number": metadata["edges_number"],
            "nodes_number": metadata["nodes_number"],
            "may_have_singletons": True,
            "verbose": self._verbose > 0,
            "directed": self._directed,
            "name": self._name,
        })

        # Load the graph
        graph = Graph.from_csv(**loader_kwargs)

        # Tradeoffs are only enabled automatically below 50e6 unique edges.
        if self._auto_enable_tradeoffs:
            if graph.get_number_of_unique_edges() < 50e6:
                graph.enable()
        return graph