Example No. 1
    def test_size(self):
        """
        Test retrieving the size of a cluster.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(len(v), c.size())
Example No. 2
    def test_year_check_range(self):
        """
        Test that when checking for a year in a range, the function returns ``True``.
        """

        article = '2019–20 Premier League'
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertTrue(resolver._has_year(article))

        article = '2019-20 Premier League'
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertTrue(resolver._has_year(article))
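
The listing shows only the behaviour of ``_has_year``, not its body. A minimal sketch that satisfies this test and Examples No. 4, 15, 16, 21 and 24 below, assuming the check is a plain four-digit regex delimited by word boundaries (the name ``has_year`` and the pattern are illustrative, not the library's confirmed implementation):

    import re

    # Four digits flanked by word boundaries: matches '1995' and '2019–20',
    # but never matches inside a longer digit run such as '1234567890'.
    YEAR_PATTERN = re.compile(r'\b\d{4}\b')

    def has_year(title):
        """Check whether the given article title mentions a year."""
        return bool(YEAR_PATTERN.search(title))
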
Example No. 3
    def test_intra_similarity_of_cluster(self):
        """
        Test that the intra-similarity of a cluster with several vectors is equivalent to the average similarity.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual((c.similarity(v[0]) + c.similarity(v[1])) / 2.,
                         c.get_intra_similarity())
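
The assertion pins down the semantics: the intra-similarity is the mean similarity between each member vector and the cluster. A sketch of that computation, assuming an empty cluster defaults to 0 (illustrative, not the library's confirmed body):

    def get_intra_similarity(cluster):
        """Average the similarity between each vector in the cluster and the cluster itself."""
        if not cluster.vectors:
            return 0
        return sum(cluster.similarity(v) for v in cluster.vectors) / len(cluster.vectors)
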
Example No. 4
    def test_year_check_range(self):
        """
        Test that when checking for a year in a range, the function returns ``True``.
        """

        article = '2019–20 Premier League'
        extrapolator = WikipediaExtrapolator([], Tokenizer(), TF())
        self.assertTrue(extrapolator._has_year(article))

        article = '2019-20 Premier League'
        extrapolator = WikipediaExtrapolator([], Tokenizer(), TF())
        self.assertTrue(extrapolator._has_year(article))
Example No. 5
    def test_setting_vectors(self):
        """
        Test setting the vectors manually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster()
        self.assertEqual({}, c.centroid.dimensions)
        c.vectors = v
        self.assertEqual(v, c.vectors)
Example No. 6
    def test_get_representative_vectors(self):
        """
        Test ranking the vectors according to their similarity to the cluster.
        """

        v = [
            Document("", ['a', 'b', 'c'], scheme=TF()),
            Document("", ['a', 'a', 'c'], scheme=TF()),
            Document("", ['p'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(list, type(c.get_representative_vectors(2)))
        self.assertEqual([v[1], v[0]], c.get_representative_vectors(2))
Example No. 7
    def test_cluster_with_several_vectors(self):
        """
        Test creating a cluster with several vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
Example No. 8
    def test_set_vectors_none(self):
        """
        Test that setting vectors to ``None`` overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        c.vectors = None
        self.assertEqual([], c.vectors)
        self.assertEqual({}, c.centroid.dimensions)
Example No. 9
    def test_score_relevance(self):
        """
        Test that when two documents are provided, one more relevant than the other, the score reflects it.
        """

        tokenizer = Tokenizer(min_length=2, stem=True)
        candidate = "Ronaldo"
        candidate_document = Document(candidate, tokenizer.tokenize(candidate))
        text = "Ronaldo, speaking after Juventus' victory, says Serie A is still wide open"
        domain = Document(text, tokenizer.tokenize(text))

        title_1 = "Cristiano Ronaldo"
        text_1 = "Cristiano Ronaldo is a Portuguese professional footballer who plays as a forward for Serie A club Juventus."
        title_document_1 = Document(title_1, tokenizer.tokenize(title_1))
        sentence_document_1 = Document(text_1, tokenizer.tokenize(text_1))

        title_2 = "Ronaldo"
        text_2 = "Ronaldo is a Brazilian former professional footballer who played as a striker."
        title_document_2 = Document(title_2, tokenizer.tokenize(title_2))
        sentence_document_2 = Document(text_2, tokenizer.tokenize(text_2))

        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        score_1 = resolver._compute_score(candidate_document, title_document_1,
                                          domain, sentence_document_1)
        score_2 = resolver._compute_score(candidate_document, title_document_2,
                                          domain, sentence_document_2)
        self.assertGreater(score_1, score_2)
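
The test constrains only the ordering of the two scores. One plausible shape for such a score, assuming it multiplies the candidate–title cosine similarity by the domain–first-sentence cosine similarity (purely illustrative; the real ``_compute_score`` may weigh the components differently):

    from math import sqrt

    def cosine(v1, v2):
        """Cosine similarity over the dimension dictionaries of two vectors."""
        product = sum(w * v2.dimensions.get(d, 0) for d, w in v1.dimensions.items())
        m1 = sqrt(sum(w ** 2 for w in v1.dimensions.values()))
        m2 = sqrt(sum(w ** 2 for w in v2.dimensions.values()))
        return product / (m1 * m2) if m1 and m2 else 0

    def compute_score(candidate, title, domain, sentence):
        """
        Score a Wikipedia page: high when the candidate matches the page title
        and the page's first sentence matches the domain of discussion.
        """
        return cosine(candidate, title) * cosine(domain, sentence)
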
Example No. 10
    def __init__(self, text='', dimensions=None, scheme=None, *args, **kwargs):
        """
        Initialize the document with the text and, optionally, the underlying vector's dimensions.

        If you only have tokens generated by a :class:`~nlp.tokenizer.Tokenizer`, you can pass them on as dimensions.
        In this case, the :class:`~nlp.document.Document` creates dimensions using the :class:`~nlp.weighting.tf.TF` term-weighting scheme.
        If you want to use a different term-weighting scheme, pass it on using the ``scheme`` parameter.

        Any other arguments or keyword arguments are passed on to the :class:`~vsm.vector.Vector` constructor.
        You can use the keyword arguments to pass on any optional attributes.

        :param text: The document's text.
        :type text: str
        :param dimensions: The initial dimensions of the document.
                           If a list is provided, it is assumed that they are tokens.
                           The dimensions are then created from this list using the given scheme.
        :type dimensions: list or dict
        :param scheme: The term-weighting scheme that is used to convert the tokens into dimensions.
                       If ``None`` is given, the :class:`~nlp.weighting.tf.TF` term-weighting scheme is used.
        :type scheme: None or :class:`~nlp.weighting.TermWeighting`
        """
        """
        If a list is provided, assume that it is a list of tokens.
        This list of tokens is converted into a dictionary representing the dimensions of the vector.
        The conversion is carried out by the term-weighting scheme.
        """
        if type(dimensions) is list:
            from nlp.weighting.tf import TF  # NOTE: The import is located here because of circular dependencies
            scheme = scheme if scheme is not None else TF()
            dimensions = scheme.create(dimensions).dimensions

        super(Document, self).__init__(dimensions, *args, **kwargs)
        self.text = text
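
A short usage sketch of the constructor above: a token list is converted into dimensions by the term-weighting scheme, so repeated tokens accumulate weight (assuming the default TF scheme assigns raw term counts):

    # 'test' appears twice, so it should carry twice the weight of 'quick'.
    document = Document('a quick test test', ['a', 'quick', 'test', 'test'])
    print(document.dimensions)  # e.g. {'a': 1, 'quick': 1, 'test': 2}
    print(document.text)        # the original text is kept alongside the vector
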
Example No. 11
    def test_extrapolate_returns_related_participants(self):
        """
        Test that when extrapolating, related participants are returned.
        """
        """
        Create the test data
        """
        tokenizer = Tokenizer(stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "The LigaPro is the second-highest division of the Portuguese football league system.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        extrapolator = WikipediaExtrapolator(corpus,
                                             tokenizer,
                                             TF(),
                                             first_level_links=15,
                                             second_level_links=15)
        participants = extrapolator.extrapolate([
            'Associação Académica de Coimbra – O.A.F.',
            'Académico de Viseu F.C.', 'S.L. Benfica B', 'FC Porto B'
        ])

        other_participants = [
            'Casa Pia A.C.', 'G.D. Chaves', 'C.D. Cova da Piedade',
            'S.C. Covilhã', 'G.D. Estoril Praia', 'S.C. Farense',
            'C.D. Feirense', 'Leixões S.C.', 'C.D. Mafra', 'C.D. Nacional',
            'U.D. Oliveirense', 'F.C. Penafiel', 'Varzim S.C.',
            'U.D. Vilafranquense'
        ]
        self.assertGreaterEqual(
            len(set(participants).intersection(set(other_participants))), 4)
Example No. 12
    def test_set_one_vectors(self):
        """
        Test that setting vectors to a single vector overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        n = Document("", ['a'], scheme=TF())
        c.vectors = n
        self.assertEqual([n], c.vectors)
        self.assertEqual(n.dimensions, c.centroid.dimensions)
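
Examples No. 5, 8, 12 and 17 jointly pin down the setter's contract: ``None`` clears the cluster, a single vector is wrapped in a list, and the centroid always tracks the members. A minimal, self-contained sketch of that contract (not the library's source):

    class ClusterSketch:
        """A cluster whose ``vectors`` property behaves as the tests above assert."""

        def __init__(self, vectors=None):
            self.vectors = vectors

        @property
        def vectors(self):
            return self._vectors

        @vectors.setter
        def vectors(self, vectors):
            if vectors is None:
                vectors = []           # ``None`` clears the cluster
            elif not isinstance(vectors, list):
                vectors = [vectors]    # a single vector is wrapped in a list
            self._vectors = vectors
            # a real Cluster would recompute the centroid here
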
Example No. 13
    def test_add_to_graph_low_threshold(self):
        """
        Test adding nodes and edges to a graph with a low threshold.
        """

        graph = nx.Graph()
        links = {
            'Olympique Lyonnais': ['Ligue 1', 'AS Monaco'],
        }

        tokenizer = Tokenizer(stem=True, stopwords=stopwords.words('english'))
        extrapolator = WikipediaExtrapolator([], tokenizer, TF())
        extrapolator._add_to_graph(graph, links, threshold=0)
        self.assertEqual(3, len(graph.nodes))
        self.assertEqual(2, len(graph.edges))
        self.assertTrue('Olympique Lyonnais' in graph.nodes)
        self.assertTrue(
            len(graph.nodes['Olympique Lyonnais']['document'].dimensions))
        self.assertTrue('Ligue 1' in graph.nodes)
        self.assertTrue('AS Monaco' in graph.nodes)
        self.assertTrue(('Olympique Lyonnais', 'Ligue 1') in graph.edges)
        self.assertTrue(('Olympique Lyonnais', 'AS Monaco') in graph.edges)
        self.assertFalse(('Ligue 1', 'AS Monaco') in graph.edges)
        self.assertGreater(
            graph.edges[('Olympique Lyonnais', 'Ligue 1')]['weight'], 0)
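
A sketch of the behaviour this test asserts: every article becomes a node holding its document, each article is joined to its outgoing links by an edge weighted with document similarity, and edges at or below the threshold are dropped. ``get_document`` and ``similarity`` are stand-ins for however the real extrapolator fetches and compares pages:

    def add_to_graph(graph, links, get_document, similarity, threshold=0):
        """Add each source article and its links to the graph as weighted edges."""
        for source, targets in links.items():
            if source not in graph:
                graph.add_node(source, document=get_document(source))
            for target in targets:
                if target not in graph:
                    graph.add_node(target, document=get_document(target))
                weight = similarity(graph.nodes[source]['document'],
                                    graph.nodes[target]['document'])
                if weight > threshold:
                    graph.add_edge(source, target, weight=weight)
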
Example No. 14
    def test_edge_centrality_multiple(self):
        """
        Test that the edge centrality correctly identifies the most central edge when there are two such edges.
        This edge should be the one with the lowest weight.
        """

        nodes = ['A', 'B', 'C', 'D', 'W', 'X', 'Y', 'Z']
        edges = {
            ('A', 'B', 0.1),
            ('A', 'C', 0.1),
            ('A', 'D', 0.1),
            ('B', 'C', 0.1),
            ('B', 'D', 0.1),
            ('C', 'D', 0.1),
            ('W', 'X', 0.1),
            ('W', 'Y', 0.1),
            ('W', 'Z', 0.1),
            ('X', 'Y', 0.1),
            ('X', 'Z', 0.1),
            ('Y', 'Z', 0.1),
            ('D', 'W', 0.1),
            ('C', 'X', 0.05),
        }

        graph = nx.Graph()
        graph.add_nodes_from(nodes)
        graph.add_weighted_edges_from(edges)

        extrapolator = WikipediaExtrapolator([], Tokenizer(), TF())
        self.assertEqual(('C', 'X'), extrapolator._most_central_edge(graph))
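
This test matches the Girvan–Newman convention: with weights read as distances, shortest paths between the two cliques funnel through the lighter of the two bridges, giving it the highest betweenness. A sketch of a helper that satisfies the test, assuming the extrapolator ranks edges by weighted betweenness centrality (illustrative, not the confirmed implementation):

    import networkx as nx

    def most_central_edge(graph):
        """Return the edge with the highest betweenness centrality, treating weights as distances."""
        centrality = nx.edge_betweenness_centrality(graph, weight='weight')
        return max(centrality, key=centrality.get)
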
Example No. 15
    def test_year_check_long_number(self):
        """
        Test that when checking for a year with a long number, the function does not detect a year.
        """

        article = '1234567890'
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertFalse(resolver._has_year(article))
Example No. 16
    def test_year_check(self):
        """
        Test that when checking a title that contains a year, the function returns ``True``.
        """

        article = 'Youssouf Koné (footballer, born 1995)'
        extrapolator = WikipediaExtrapolator([], Tokenizer(), TF())
        self.assertTrue(extrapolator._has_year(article))
Example No. 17
    def test_set_several_vectors(self):
        """
        Test that setting vectors to several vectors overwrites existing vectors.
        """

        v = Document("", ['a'], scheme=TF())
        c = Cluster(v)
        self.assertEqual([v], c.vectors)
        self.assertEqual(v.dimensions, c.centroid.dimensions)

        n = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        c.vectors = n
        self.assertEqual(n, c.vectors)
Example No. 18
    def test_get_first_sentence_full_without_period(self):
        """
        Test that when getting the first sentence from a text that has only one sentence, but without punctuation, the whole text is returned.
        """

        text = "Youssouf Koné (born 5 July 1995) is a Malian professional footballer who plays for French side Olympique Lyonnais and the Mali national team as a left-back"
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertEqual(text, resolver._get_first_sentence(text))
Example No. 19
    def test_centroid_normalized(self):
        """
        Test that the centroid is normalized.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
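
The rounding to 10 decimal places absorbs floating-point error in the norm. Assuming ``vector_math.magnitude`` is the usual Euclidean (l2) norm over the dimension weights, it amounts to:

    from math import sqrt

    def magnitude(vector):
        """Compute the Euclidean magnitude of a vector from its dimension weights."""
        return sqrt(sum(w ** 2 for w in vector.dimensions.values()))
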
Example No. 20
    def test_get_first_sentence_empty(self):
        """
        Test that when getting the first sentence from an empty string, an empty string is returned.
        """

        text = ""
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertEqual(text, resolver._get_first_sentence(text))
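
Examples No. 18 and No. 20 pin down two edge cases of ``_get_first_sentence``: an empty string comes back empty, and a text without terminal punctuation comes back whole. A sketch that satisfies both, assuming sentence splitting with NLTK (illustrative, not the confirmed implementation):

    from nltk.tokenize import sent_tokenize

    def get_first_sentence(text):
        """Return the first sentence of the text, or the text itself if it cannot be split."""
        sentences = sent_tokenize(text)
        return sentences[0] if sentences else text
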
Example No. 21
    def test_year_check(self):
        """
        Test that when checking a title that contains a year, the function returns ``True``.
        """

        article = 'Youssouf Koné (footballer, born 1995)'
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertTrue(resolver._has_year(article))
Example No. 22
    def test_intra_similarity_of_cluster_with_single_vector(self):
        """
        Test that the intra-similarity of a cluster with a single vector is equivalent to that vector's similarity with the cluster.
        """

        v = Document("", ['a', 'b'], scheme=TF())
        c = Cluster(v)
        self.assertEqual(c.similarity(v), c.get_intra_similarity())
Example No. 23
    def test_empty_cluster_similarity(self):
        """
        Test that when calculating the similarity between a vector and an empty cluster, the similarity is 0.
        """

        c = Cluster()
        v = Document("", ["a", "c"], scheme=TF())
        self.assertEqual(0, c.similarity(v))
Example No. 24
    def test_year_check_long_number(self):
        """
        Test that when checking for a year with a long number, the function does not detect a year.
        """

        article = '1234567890'
        extrapolator = WikipediaExtrapolator([], Tokenizer(), TF())
        self.assertFalse(extrapolator._has_year(article))
Example No. 25
    def test_resolve_empty(self):
        """
        Test that when resolving an empty set of candidates, the resolver returns empty lists.
        """

        resolver = WikipediaNameResolver(TF(), Tokenizer(), 0, [])
        resolved, unresolved = resolver.resolve({})
        self.assertFalse(len(resolved))
        self.assertFalse(len(unresolved))
Example No. 26
    def test_remove_vectors(self):
        """
        Test removing vectors from a cluster gradually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)

        c = Cluster(v)
        c.vectors.remove(v[1])
        self.assertEqual([v[0]], c.vectors)
        c.vectors.remove(v[0])
        self.assertEqual([], c.vectors)
Example No. 27
    def test_add_vectors(self):
        """
        Test adding vectors to a cluster gradually.
        """

        c = Cluster()
        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        self.assertEqual({}, c.centroid.dimensions)

        c.vectors.append(v[0])
        self.assertEqual([v[0]], c.vectors)

        c.vectors.append(v[1])
        self.assertEqual(v, c.vectors)
Example No. 28
    def test_remove_unclosed_brackets(self):
        """
        Test that when removing brackets that are not closed, they are not removed.
        """

        article = 'Youssouf Koné (footballer, born 1995'
        resolver = WikipediaSearchResolver(TF(), Tokenizer(), 0, [])
        self.assertEqual('Youssouf Koné (footballer, born 1995',
                         resolver._remove_brackets(article).strip())
Example No. 29
    def test_remove_unclosed_brackets(self):
        """
        Test that when removing brackets that are not closed, they are not removed.
        """

        article = 'Youssouf Koné (footballer, born 1995'
        extrapolator = WikipediaExtrapolator([], Tokenizer(), TF())
        self.assertEqual('Youssouf Koné (footballer, born 1995',
                         extrapolator._remove_brackets(article).strip())
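
Both bracket-removal tests assert the same contract: only matched pairs are removed, so an unclosed ``(`` survives. A one-line sketch with that property (illustrative, not the confirmed implementation):

    import re

    def remove_brackets(text):
        """Remove parenthesized spans; the pattern requires a closing bracket, so unclosed ones survive."""
        return re.sub(r'\([^()]*\)', '', text)
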
Example No. 30
    def test_cluster_with_one_vector(self):
        """
        Test that the centroid of a cluster with a single vector is equivalent to that vector.
        """

        v = Document("a", ["a", "b", "a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertEqual(v.dimensions, c.centroid.dimensions)