Example No. 1
    def test_cluster_with_one_vector(self):
        """
        Test that the centroid of a cluster with a single vector is equivalent to that vector.
        """

        v = Document("a", ["a", "b", "a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertEqual(v.dimensions, c.centroid.dimensions)
Example No. 2
    def test_centroid_normalized_several_vectors(self):
        """
        Test that the centroid is always normalized.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
        c.vectors.append(Document("", ["a", "b", "a", "d"]))
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
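
The two assertions above suggest that Cluster recomputes and re-normalizes its centroid whenever its vector list changes. A minimal sketch of that behaviour, assuming each vector exposes a dimensions dictionary as in the tests (the actual Cluster class may implement this differently):

import math

def compute_centroid(vectors):
    """
    Average the vectors' dimensions and normalize the result to unit length,
    so that the centroid's magnitude is always 1.
    """
    centroid = {}
    for vector in vectors:
        for dimension, value in vector.dimensions.items():
            centroid[dimension] = centroid.get(dimension, 0) + value / len(vectors)

    magnitude = math.sqrt(sum(value ** 2 for value in centroid.values()))
    if magnitude:
        centroid = { dimension: value / magnitude
                     for dimension, value in centroid.items() }
    return centroid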
Example No. 3
    def test_size(self):
        """
        Test retrieving the size of a cluster.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(len(v), c.size())
Example No. 4
    def test_intra_similarity_of_cluster(self):
        """
        Test that the intra-similarity of a cluster with several vectors is equivalent to the average similarity between the cluster and its vectors.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual((c.similarity(v[0]) + c.similarity(v[1])) / 2.,
                         c.get_intra_similarity())
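
Consistent with this assertion, the intra-similarity appears to be the mean similarity between the cluster and each of its vectors. A hypothetical sketch (the real method may differ):

def get_intra_similarity(cluster):
    """
    Average the similarity between the cluster and each of its vectors.
    An empty cluster is taken to have an intra-similarity of 0.
    """
    if not cluster.vectors:
        return 0
    return sum(cluster.similarity(vector) for vector in cluster.vectors) / len(cluster.vectors)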
Example No. 5
    def test_get_representative_vectors(self):
        """
        Test ranking the vectors according to their similarity to the cluster.
        """

        v = [
            Document("", ['a', 'b', 'c'], scheme=TF()),
            Document("", ['a', 'a', 'c'], scheme=TF()),
            Document("", ['p'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(list, type(c.get_representative_vectors(2)))
        self.assertEqual([v[1], v[0]], c.get_representative_vectors(2))
Example No. 6
    def test_get_centroid(self):
        """
        Test getting the centroid.
        """

        v = Document("", ["a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertTrue(
            all(
                round(v.dimensions[dimension], 10) == round(
                    c.centroid.dimensions[dimension], 10)
                for dimension in v.dimensions.keys() | c.centroid.dimensions))
Example No. 7
    def test_setting_vectors(self):
        """
        Test setting the vectors manually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster()
        self.assertEqual({}, c.centroid.dimensions)
        c.vectors = v
        self.assertEqual(v, c.vectors)
Example No. 8
    def test_cluster_with_several_vectors(self):
        """
        Test creating a cluster with several vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
Example No. 9
    def test_get_term_frequency(self):
        """
        Test retrieving a term's frequency from a document.
        """

        document1 = Document("This document is a document with no duplicates.",
                             preserve_duplicates=False)
        document2 = Document("This document is a document with duplicates.",
                             preserve_duplicates=True)
        document3 = Document(
            "This is just to check that a word is not present.",
            preserve_duplicates=True)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document1), 1)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document2), 2)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document3), 0)
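
Judging from these assertions, get_term_frequency counts a term's occurrences in a document, respecting the preserve_duplicates flag used when the document was built. A minimal sketch, where the words attribute is an assumption about the Document class:

def get_term_frequency(term, document):
    """
    Count the term's occurrences in the document's token list.
    Note: 'words' is an assumed attribute name; the real class may differ.
    """
    return document.words.count(term)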
Example No. 10
    def test_set_vectors_none(self):
        """
        Test that setting vectors to ``None`` overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        c.vectors = None
        self.assertEqual([], c.vectors)
        self.assertEqual({}, c.centroid.dimensions)
Example No. 11
    def test_get_word_vector(self):
        """
        Test retrieving a term's tf-idf word vector from the term-document matrix.
        """

        documents = [
            Document("Nice document, document", preserve_duplicates=True),
            Document("Bad document"),
            Document("Alright document"),
            Document("Nice day today"),
            Document("No day is a bad day")
        ]
        term_document_matrix = TermDocumentMatrix(documents)
        word_vector = term_document_matrix.word_vectors[
            term_document_matrix.vocabulary.index("document")]
        expected_word_vector = [
            2 * log(5 / 4), 1 * log(5 / 4), 1 * log(5 / 4), 0.0, 0.0
        ]
        self.assertEqual(word_vector, tuple(expected_word_vector))
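
The expected values are consistent with a tf-idf weighting in which the document frequency is smoothed by one: "document" occurs twice in the first document (duplicates are preserved) and appears in three of the five documents, so each entry is the term frequency multiplied by log(5 / (3 + 1)) = log(5 / 4). A worked check of that assumed formula:

from math import log

tf = [2, 1, 1, 0, 0]       # occurrences of "document" in each of the five documents
df = 3                     # number of documents containing "document"
idf = log(5 / (df + 1))    # assumed add-one smoothing, which yields log(5 / 4)
expected = [frequency * idf for frequency in tf]
# expected == [2 * log(5 / 4), log(5 / 4), log(5 / 4), 0.0, 0.0]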
Example No. 12
    def test_set_one_vectors(self):
        """
        Test that setting vectors to a single vector overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        n = Document("", ['a'], scheme=TF())
        c.vectors = n
        self.assertEqual([n], c.vectors)
        self.assertEqual(n.dimensions, c.centroid.dimensions)
Example No. 13
    def _disambiguate(self, pages):
        """
        Disambiguate a candidate by finding the link that is most similar to the domain.
        Only one page is returned: the page name of the most similar link, together with its similarity score.

        :param pages: A list of page titles.
        :type pages: list of str

        :return: A tuple containing the most similar page and its similarity score.
        :rtype: tuple
        """
        """
        Get the first section of each page.
        Then, convert them into documents.
        """
        pages = text.collect(pages, introduction_only=True)
        for page, introduction in pages.items():
            pages[page] = Document(introduction,
                                   self.tokenizer.tokenize(introduction),
                                   scheme=self.scheme)
            pages[page].normalize()
        """
        Rank the page scores in descending order.
        Then, choose the best page and return it alongside its score.
        """
        scores = {
            page: vector_math.cosine(introduction, self.domain)
            for page, introduction in pages.items()
        }
        article, score = sorted(scores.items(),
                                key=lambda score: score[1],
                                reverse=True)[0]
        return (article, score)
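
Since only the top-ranked page is needed, the final sort could equally be written with max; a behaviourally equivalent alternative to the last step:

article, score = max(scores.items(), key=lambda score: score[1])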
Example No. 14
    def test_extrapolate_returns_related_participants(self):
        """
        Test that when extrapolating, related participants are returned.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "The LigaPro is the second-highest division of the Portuguese football league system.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        extrapolator = WikipediaExtrapolator(corpus,
                                             tokenizer,
                                             TF(),
                                             first_level_links=15,
                                             second_level_links=15)
        participants = extrapolator.extrapolate([
            'Associação Académica de Coimbra – O.A.F.',
            'Académico de Viseu F.C.', 'S.L. Benfica B', 'FC Porto B'
        ])

        other_participants = [
            'Casa Pia A.C.', 'G.D. Chaves', 'C.D. Cova da Piedade',
            'S.C. Covilhã', 'G.D. Estoril Praia', 'S.C. Farense',
            'C.D. Feirense', 'Leixões S.C.', 'C.D. Mafra', 'C.D. Nacional',
            'U.D. Oliveirense', 'F.C. Penafiel', 'Varzim S.C.',
            'U.D. Vilafranquense'
        ]
        self.assertGreaterEqual(
            len(set(participants).intersection(set(other_participants))), 4)
Example No. 15
    def test_zero_threshold(self):
        """
        Test that when a threshold of zero is given, all candidate participants are retained.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertTrue('damascus' in scores)
Example No. 16
    def test_get_bag_of_words(self):
        """
        Test splitting parsed text into a bag of words on whitespace.
        """

        bag_of_words = Document.get_bag_of_words(
            "sample parsed text to split on whitespace")
        expected_bag_of_words = [
            "sample", "parsed", "text", "to", "split", "on", "whitespace"
        ]
        self.assertEqual(bag_of_words, expected_bag_of_words)
Example No. 17
def load_corpus(filename, clean):
    """
    Load the corpus from the given filename.

    :param filename: The path to the corpus from where to detect participants.
    :type filename: str
    :param clean: A boolean indicating whether tweets should be cleaned while loading them.
    :type clean: bool

    :return: A list of :class:`~nlp.document.Document` making up the corpus.
    :rtype: list of :class:`~nlp.document.Document`
    """

    cleaner = TweetCleaner(replace_mentions=True)

    corpus = []
    with open(filename) as f:
        for i, line in enumerate(f):
            tweet = json.loads(line)
            original = tweet
            while "retweeted_status" in tweet:
                tweet = tweet["retweeted_status"]

            if "extended_tweet" in tweet:
                text = tweet["extended_tweet"].get("full_text",
                                                   tweet.get("text", ""))
            else:
                text = tweet.get("text", "")

            text = cleaner.clean(text, original) if clean else text
            document = Document(text)
            corpus.append(document)

    return corpus
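
A typical call, assuming a newline-delimited JSON file of tweets (the path below is hypothetical):

corpus = load_corpus('corpora/event.json', clean=True)    # hypothetical path
print(len(corpus), 'documents loaded')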
Example No. 18
    def test_threshold_filter(self):
        """
        Test the basic functionality of the threshold filter.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0.75)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertFalse('damascus' in scores)
Example No. 19
    def test_extract_from_text(self):
        """
        Test that the entity extractor's named entities do appear in the corresponding tweet.
        """
        """
        Load the corpus.
        """
        filename = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                '..', 'tests', 'corpora', 'understanding',
                                'CRYCHE-100.json')
        corpus = []
        with open(filename) as f:
            for i, line in enumerate(f):
                tweet = json.loads(line)
                original = tweet
                while "retweeted_status" in tweet:
                    tweet = tweet["retweeted_status"]

                if "extended_tweet" in tweet:
                    text = tweet["extended_tweet"].get("full_text",
                                                       tweet.get("text", ""))
                else:
                    text = tweet.get("text", "")

                document = Document(text)
                corpus.append(document)

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        for (document, candidate_set) in zip(corpus, candidates):
            text = document.text.lower().replace('\n', ' ').replace('  ', ' ')
            self.assertTrue(
                all(candidate in text for candidate in candidate_set))
Example No. 20
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        """
        Ensure that the more common candidates are ranked towards the beginning.
        """
        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        self.assertTrue(scores)
        resolved, unresolved = Resolver().resolve(scores)
        self.assertEqual(set(scores.keys()), set(resolved))
        self.assertEqual([], unresolved)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
Example No. 21
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
        self.assertEqual(
            set([
                'falter', 'against', 'hotspur', 'unable', 'avoid', 'defeat',
                'lose', 'again'
            ]), set(resolved[3:]))
Example No. 22
    def test_empty_cluster_similarity(self):
        """
        Test that when calculating the similarity between a vector and an empty cluster, the similarity is 0.
        """

        c = Cluster()
        v = Document("", ["a", "c"], scheme=TF())
        self.assertEqual(0, c.similarity(v))
Example No. 23
    def test_centroid_normalized(self):
        """
        Test that the centroid is normalized.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
Example No. 24
    def test_set_several_vectors(self):
        """
        Test that setting vectors to several vectors overwrites existing vectors.
        """

        v = Document("", ['a'], scheme=TF())
        c = Cluster(v)
        self.assertEqual([v], c.vectors)
        self.assertEqual(v.dimensions, c.centroid.dimensions)

        n = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        c.vectors = n
        self.assertEqual(n, c.vectors)
Example No. 25
    def test_intra_similarity_of_cluster_with_single_vector(self):
        """
        Test that the intra-similarity of a cluster with a single vector is equivalent to that vector's similarity with the cluster.
        """

        v = Document("", ['a', 'b'], scheme=TF())
        c = Cluster(v)
        self.assertEqual(c.similarity(v), c.get_intra_similarity())
Example No. 26
    def test_get_ngrams(self):
        """
        Test extracting n-grams of several lengths from a list of words.
        """

        ngrams = [2, 3]
        bag_of_words = Document.get_ngrams(
            ngrams, ["hello", "this", "is", "a", "document"])
        expected_bag_of_words = [
            "hello this", "this is", "is a", "a document", "hello this is",
            "this is a", "is a document"
        ]
        self.assertEqual(bag_of_words, expected_bag_of_words)
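
The expected output lists all bigrams before any trigram, which suggests the method iterates over the n values first and then slides over the words. A minimal sketch of that assumed behaviour:

def get_ngrams(ngrams, words):
    """
    For each n, join the sliding windows of n consecutive words with spaces.
    Iterating over the n values first yields all bigrams before any trigram.
    """
    return [" ".join(words[i:i + n])
            for n in ngrams
            for i in range(len(words) - n + 1)]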
Example No. 27
    def test_remove_vectors(self):
        """
        Test removing vectors from a cluster gradually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)

        c = Cluster(v)
        c.vectors.remove(v[1])
        self.assertEqual([v[0]], c.vectors)
        c.vectors.remove(v[0])
        self.assertEqual([], c.vectors)
Example No. 28
    def test_add_vectors(self):
        """
        Test adding vectors to a cluster gradually.
        """

        c = Cluster()
        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        self.assertEqual({}, c.centroid.dimensions)

        c.vectors.append(v[0])
        self.assertEqual([v[0]], c.vectors)

        c.vectors.append(v[1])
        self.assertEqual(v, c.vectors)
Example No. 29
    def test_get_inverse_document_frequency(self):
        """
        Test computing the inverse document frequency of a term.
        """

        documents = [
            Document("This document is a document with no duplicates."),
            Document("This document is a document with duplicates."),
            Document("This is just to check that a word is not present.")
        ]
        term_document_matrix = TermDocumentMatrix(documents)
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("document"),
            log(3 / 3))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("not_present"),
            log(3 / 1))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("just"),
            log(3 / 2))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("is"),
            log(3 / 4))
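
These expected values imply add-one smoothing of the document frequency: "document" appears in two of the three documents, giving log(3 / (2 + 1)) = log(3 / 3) = 0, while "is", which appears in all three, gives log(3 / 4). A sketch under that assumption, where the words attribute is again an assumed name:

from math import log

def get_inverse_document_frequency(documents, term):
    """
    Assumed formula: log(N / (df + 1)), where df is the number of
    documents that contain the term at least once.
    """
    df = sum(1 for document in documents if term in document.words)
    return log(len(documents) / (df + 1))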
Example No. 30
    def test_cluster_with_several_vectors_copy(self):
        """
        Test that when creating a cluster with several vectors, a copy is created.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
        copy = list(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)
        self.assertEqual(copy, v)
        self.assertEqual(2, len(v))