def test_cluster_with_one_vector(self):
    """
    Test that a cluster created from a single vector has a centroid with the same dimensions as that vector.
    """

    document = Document("a", ["a", "b", "a", "c"], scheme=TF())
    document.normalize()

    cluster = Cluster(document)
    centroid = cluster.centroid
    self.assertEqual(document.dimensions, centroid.dimensions)
def test_get_centroid(self):
    """
    Test that the centroid of a single-vector cluster agrees with the vector in every dimension.
    Weights are compared after rounding to 10 decimal places.
    """

    document = Document("", ["a", "c"], scheme=TF())
    document.normalize()
    cluster = Cluster(document)

    """
    Compare over the union of the dimensions of the vector and of the centroid.
    """
    dimensions = document.dimensions.keys() | cluster.centroid.dimensions
    matches = (
        round(document.dimensions[dimension], 10)
        == round(cluster.centroid.dimensions[dimension], 10)
        for dimension in dimensions
    )
    self.assertTrue(all(matches))
def _add_to_graph(self, graph, outgoing_links, threshold=0):
    """
    Extend the graph with the given articles and the links between them.

    The introductions of all the source and target articles are collected, and each one is reduced to a normalized document built from its first sentence.
    Articles that are missing from the collected text are skipped.
    A source is linked to one of its outgoing links only if the cosine similarity between their documents exceeds the threshold.

    .. note::

        The weight of an edge is `1 - similarity`.
        The more similar two articles are, the lighter the edge between them, so more paths go through that edge.

    :param graph: The graph to which the new nodes and edges are added.
                  The graph is modified in place.
    :type graph: :class:`~nx.Graph`
    :param outgoing_links: The dictionary of links.
                           The keys should be the source articles, and the values should be the outgoing links from these articles.
    :type outgoing_links: dict
    :param threshold: The minimum similarity between the source and target articles to add an edge between them.
                      The similarity must be strictly greater than this value.
    :type threshold: float
    """

    """
    Collect the introductions of all articles: first the sources, then their outgoing links.
    """
    titles = list(outgoing_links)
    for links in outgoing_links.values():
        titles.extend(links)
    articles = text.collect(titles, introduction_only=True)

    """
    Build one normalized document per article from the first sentence of its introduction.
    """
    documents = {}
    for title, introduction in articles.items():
        sentence = self._get_first_sentence(self._remove_brackets(introduction))
        document = Document(sentence, self.tokenizer.tokenize(sentence),
                            scheme=self.scheme)
        document.normalize()
        documents[title] = document

    """
    Go through the outgoing links, adding nodes as they are encountered.
    Existing nodes are left untouched so that their attributes are not overwritten.
    An edge is added only if the similarity between the source and the target exceeds the threshold.
    """
    for source, targets in outgoing_links.items():
        if source not in documents:
            continue

        source_document = documents[source]
        if source not in graph.nodes:
            graph.add_node(source, document=source_document)

        for target in targets:
            if target not in documents:
                continue

            target_document = documents[target]
            if target not in graph.nodes:
                graph.add_node(target, document=target_document)

            similarity = vector_math.cosine(source_document, target_document)
            if similarity > threshold:
                graph.add_edge(source, target, weight=(1 - similarity))