Example No. 1
    def test_cluster_with_one_vector(self):
        """
        Test that the centroid of a cluster with a single vector is equivalent to that vector.
        """

        v = Document("a", ["a", "b", "a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertEqual(v.dimensions, c.centroid.dimensions)
Example No. 2
    def test_centroid_normalized_several_vectors(self):
        """
        Test that the centroid is always normalized.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
        c.vectors.append(Document("", ["a", "b", "a", "d"]))
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
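
The two assertions above suggest that Cluster recomputes and re-normalizes its centroid whenever its vector list changes. A minimal sketch of that behaviour, assuming each vector exposes a dimensions dictionary as in the tests (the actual Cluster class may implement this differently):

import math

def compute_centroid(vectors):
    """
    Average the vectors' dimensions and normalize the result to unit length,
    so that the centroid's magnitude is always 1.
    """
    centroid = {}
    for vector in vectors:
        for dimension, value in vector.dimensions.items():
            centroid[dimension] = centroid.get(dimension, 0) + value / len(vectors)

    magnitude = math.sqrt(sum(value ** 2 for value in centroid.values()))
    if magnitude:
        centroid = { dimension: value / magnitude
                     for dimension, value in centroid.items() }
    return centroid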
Example No. 3
    def test_size(self):
        """
        Test retrieving the size of a cluster.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(len(v), c.size())
Example No. 4
    def test_intra_similarity_of_cluster(self):
        """
        Test that the intra-similarity of a cluster with several vectors is equivalent to the average similarity between the cluster and its vectors.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual((c.similarity(v[0]) + c.similarity(v[1])) / 2.,
                         c.get_intra_similarity())
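
Consistent with this assertion, the intra-similarity appears to be the mean similarity between the cluster and each of its vectors. A hypothetical sketch (the real method may differ):

def get_intra_similarity(cluster):
    """
    Average the similarity between the cluster and each of its vectors.
    An empty cluster is taken to have an intra-similarity of 0.
    """
    if not cluster.vectors:
        return 0
    return sum(cluster.similarity(vector) for vector in cluster.vectors) / len(cluster.vectors)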
Example No. 5
    def test_get_representative_vectors(self):
        """
        Test ranking the vectors according to their similarity to the cluster.
        """

        v = [
            Document("", ['a', 'b', 'c'], scheme=TF()),
            Document("", ['a', 'a', 'c'], scheme=TF()),
            Document("", ['p'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(list, type(c.get_representative_vectors(2)))
        self.assertEqual([v[1], v[0]], c.get_representative_vectors(2))
Example No. 6
    def test_get_centroid(self):
        """
        Test getting the centroid.
        """

        v = Document("", ["a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertTrue(
            all(
                round(v.dimensions[dimension], 10) == round(
                    c.centroid.dimensions[dimension], 10)
                for dimension in v.dimensions.keys() | c.centroid.dimensions))
Example No. 7
    def test_setting_vectors(self):
        """
        Test setting the vectors manually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster()
        self.assertEqual({}, c.centroid.dimensions)
        c.vectors = v
        self.assertEqual(v, c.vectors)
Example No. 8
    def test_cluster_with_several_vectors(self):
        """
        Test creating a cluster with several vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
Example No. 9
    def test_get_term_frequency(self):
        """
        Test retrieving a term's frequency from a document.
        """

        document1 = Document("This document is a document with no duplicates.",
                             preserve_duplicates=False)
        document2 = Document("This document is a document with duplicates.",
                             preserve_duplicates=True)
        document3 = Document(
            "This is just to check that a word is not present.",
            preserve_duplicates=True)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document1), 1)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document2), 2)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document3), 0)
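
Judging from these assertions, get_term_frequency counts a term's occurrences in a document, respecting the preserve_duplicates flag used when the document was built. A minimal sketch, where the words attribute is an assumption about the Document class:

def get_term_frequency(term, document):
    """
    Count the term's occurrences in the document's token list.
    Note: 'words' is an assumed attribute name; the real class may differ.
    """
    return document.words.count(term)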
Example No. 10
    def test_set_vectors_none(self):
        """
        Test that setting vectors to ``None`` overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        c.vectors = None
        self.assertEqual([], c.vectors)
        self.assertEqual({}, c.centroid.dimensions)
Example No. 11
    def test_get_word_vector(self):
        """
        Test retrieving a term's tf-idf word vector from the term-document matrix.
        """

        documents = [
            Document("Nice document, document", preserve_duplicates=True),
            Document("Bad document"),
            Document("Alright document"),
            Document("Nice day today"),
            Document("No day is a bad day")
        ]
        term_document_matrix = TermDocumentMatrix(documents)
        word_vector = term_document_matrix.word_vectors[
            term_document_matrix.vocabulary.index("document")]
        expected_word_vector = [
            2 * log(5 / 4), 1 * log(5 / 4), 1 * log(5 / 4), 0.0, 0.0
        ]
        self.assertEqual(word_vector, tuple(expected_word_vector))
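
The expected values are consistent with a tf-idf weighting in which the document frequency is smoothed by one: "document" occurs twice in the first document (duplicates are preserved) and appears in three of the five documents, so each entry is the term frequency multiplied by log(5 / (3 + 1)) = log(5 / 4). A worked check of that assumed formula:

from math import log

tf = [2, 1, 1, 0, 0]       # occurrences of "document" in each of the five documents
df = 3                     # number of documents containing "document"
idf = log(5 / (df + 1))    # assumed add-one smoothing, which yields log(5 / 4)
expected = [frequency * idf for frequency in tf]
# expected == [2 * log(5 / 4), log(5 / 4), log(5 / 4), 0.0, 0.0]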
Example No. 12
    def test_set_one_vectors(self):
        """
        Test that setting vectors to a single vector overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        n = Document("", ['a'], scheme=TF())
        c.vectors = n
        self.assertEqual([n], c.vectors)
        self.assertEqual(n.dimensions, c.centroid.dimensions)
Example No. 13
    def _disambiguate(self, pages):
        """
        Disambiguate a candidate by finding the link that is most similar to the domain.
        Only one page is returned: the page name of the most similar link, together with its similarity score.

        :param pages: A list of page titles.
        :type pages: list of str

        :return: A tuple containing the most similar page and its similarity score.
        :rtype: tuple
        """
        """
        Get the first section of each page.
        Then, convert them into documents.
        """
        pages = text.collect(pages, introduction_only=True)
        for page, introduction in pages.items():
            pages[page] = Document(introduction,
                                   self.tokenizer.tokenize(introduction),
                                   scheme=self.scheme)
            pages[page].normalize()
        """
        Rank the page scores in descending order.
        Then, choose the best page and return it alongside its score.
        """
        scores = {
            page: vector_math.cosine(introduction, self.domain)
            for page, introduction in pages.items()
        }
        article, score = sorted(scores.items(),
                                key=lambda score: score[1],
                                reverse=True)[0]
        return (article, score)
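
Since only the top-ranked page is needed, the final sort could equally be written with max; a behaviourally equivalent alternative to the last step:

article, score = max(scores.items(), key=lambda score: score[1])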
Example No. 14
    def test_extrapolate_returns_related_participants(self):
        """
        Test that when extrapolating, related participants are returned.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "The LigaPro is the second-highest division of the Portuguese football league system.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        extrapolator = WikipediaExtrapolator(corpus,
                                             tokenizer,
                                             TF(),
                                             first_level_links=15,
                                             second_level_links=15)
        participants = extrapolator.extrapolate([
            'Associação Académica de Coimbra – O.A.F.',
            'Académico de Viseu F.C.', 'S.L. Benfica B', 'FC Porto B'
        ])

        other_participants = [
            'Casa Pia A.C.', 'G.D. Chaves', 'C.D. Cova da Piedade',
            'S.C. Covilhã', 'G.D. Estoril Praia', 'S.C. Farense',
            'C.D. Feirense', 'Leixões S.C.', 'C.D. Mafra', 'C.D. Nacional',
            'U.D. Oliveirense', 'F.C. Penafiel', 'Varzim S.C.',
            'U.D. Vilafranquense'
        ]
        self.assertGreaterEqual(
            len(set(participants).intersection(set(other_participants))), 4)
Example No. 15
    def test_zero_threshold(self):
        """
        Test that when a threshold of zero is given, all candidate participants are retained.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertTrue('damascus' in scores)
Example No. 16
    def test_get_bag_of_words(self):
        """
        Test splitting parsed text into a bag of words on whitespace.
        """

        bag_of_words = Document.get_bag_of_words(
            "sample parsed text to split on whitespace")
        expected_bag_of_words = [
            "sample", "parsed", "text", "to", "split", "on", "whitespace"
        ]
        self.assertEqual(bag_of_words, expected_bag_of_words)
Example No. 17
def load_corpus(filename, clean):
    """
    Load the corpus from the given filename.

    :param filename: The path to the corpus from where to detect participants.
    :type filename: str
    :param clean: A boolean indicating whether tweets should be cleaned while loading them.
    :type clean: bool

    :return: A list of :class:`~nlp.document.Document` making up the corpus.
    :rtype: list of :class:`~nlp.document.Document`
    """

    cleaner = TweetCleaner(replace_mentions=True)

    corpus = []
    with open(filename) as f:
        for i, line in enumerate(f):
            tweet = json.loads(line)
            original = tweet
            while "retweeted_status" in tweet:
                tweet = tweet["retweeted_status"]

            if "extended_tweet" in tweet:
                text = tweet["extended_tweet"].get("full_text",
                                                   tweet.get("text", ""))
            else:
                text = tweet.get("text", "")

            text = cleaner.clean(text, original) if clean else text
            document = Document(text)
            corpus.append(document)

    return corpus
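
A typical call, assuming a newline-delimited JSON file of tweets (the path below is hypothetical):

corpus = load_corpus('corpora/event.json', clean=True)    # hypothetical path
print(len(corpus), 'documents loaded')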
Example No. 18
    def test_threshold_filter(self):
        """
        Test the basic functionality of the threshold filter.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [ Document(post, tokenizer.tokenize(post)) for post in posts ]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0.75)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertFalse('damascus' in scores)
Example No. 19
    def test_extract_from_text(self):
        """
        Test that the entity extractor's named entities do appear in the corresponding tweet.
        """
        """
        Load the corpus.
        """
        filename = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                '..', 'tests', 'corpora', 'understanding',
                                'CRYCHE-100.json')
        corpus = []
        with open(filename) as f:
            for i, line in enumerate(f):
                tweet = json.loads(line)
                original = tweet
                while "retweeted_status" in tweet:
                    tweet = tweet["retweeted_status"]

                if "extended_tweet" in tweet:
                    text = tweet["extended_tweet"].get("full_text",
                                                       tweet.get("text", ""))
                else:
                    text = tweet.get("text", "")

                document = Document(text)
                corpus.append(document)

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        for (document, candidate_set) in zip(corpus, candidates):
            text = document.text.lower().replace('\n', ' ').replace('  ', ' ')
            self.assertTrue(
                all(candidate in text for candidate in candidate_set))
Example No. 20
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        """
        Ensure that the more common candidates are ranked towards the beginning.
        """
        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        self.assertTrue(scores)
        resolved, unresolved = Resolver().resolve(scores)
        self.assertEqual(set(scores.keys()), set(resolved))
        self.assertEqual([], unresolved)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
Example No. 21
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
        self.assertEqual(
            set([
                'falter', 'against', 'hotspur', 'unable', 'avoid', 'defeat',
                'lose', 'again'
            ]), set(resolved[3:]))
Example No. 22
    def test_empty_cluster_similarity(self):
        """
        Test that when calculating the similarity between a vector and an empty cluster, the similarity is 0.
        """

        c = Cluster()
        v = Document("", ["a", "c"], scheme=TF())
        self.assertEqual(0, c.similarity(v))
Example No. 23
    def test_centroid_normalized(self):
        """
        Test that the centroid is normalized.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
Example No. 24
    def test_set_several_vectors(self):
        """
        Test that setting vectors to several vectors overwrites existing vectors.
        """

        v = Document("", ['a'], scheme=TF())
        c = Cluster(v)
        self.assertEqual([v], c.vectors)
        self.assertEqual(v.dimensions, c.centroid.dimensions)

        n = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        c.vectors = n
        self.assertEqual(n, c.vectors)
Example No. 25
    def test_intra_similarity_of_cluster_with_single_vector(self):
        """
        Test that the intra-similarity of a cluster with a single vector is equivalent to that vector's similarity with the cluster.
        """

        v = Document("", ['a', 'b'], scheme=TF())
        c = Cluster(v)
        self.assertEqual(c.similarity(v), c.get_intra_similarity())
Example No. 26
    def test_get_ngrams(self):
        """
        Test extracting n-grams of several lengths from a list of words.
        """

        ngrams = [2, 3]
        bag_of_words = Document.get_ngrams(
            ngrams, ["hello", "this", "is", "a", "document"])
        expected_bag_of_words = [
            "hello this", "this is", "is a", "a document", "hello this is",
            "this is a", "is a document"
        ]
        self.assertEqual(bag_of_words, expected_bag_of_words)
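
The expected output lists all bigrams before any trigram, which suggests the method iterates over the n values first and then slides over the words. A minimal sketch of that assumed behaviour:

def get_ngrams(ngrams, words):
    """
    For each n, join the sliding windows of n consecutive words with spaces.
    Iterating over the n values first yields all bigrams before any trigram.
    """
    return [" ".join(words[i:i + n])
            for n in ngrams
            for i in range(len(words) - n + 1)]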
Example No. 27
    def test_remove_vectors(self):
        """
        Test removing vectors from a cluster gradually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)

        c = Cluster(v)
        c.vectors.remove(v[1])
        self.assertEqual([v[0]], c.vectors)
        c.vectors.remove(v[0])
        self.assertEqual([], c.vectors)
Example No. 28
    def test_add_vectors(self):
        """
        Test adding vectors to a cluster gradually.
        """

        c = Cluster()
        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        self.assertEqual({}, c.centroid.dimensions)

        c.vectors.append(v[0])
        self.assertEqual([v[0]], c.vectors)

        c.vectors.append(v[1])
        self.assertEqual(v, c.vectors)
Example No. 29
    def test_get_inverse_document_frequency(self):
        """
        Test computing the inverse document frequency of a term.
        """

        documents = [
            Document("This document is a document with no duplicates."),
            Document("This document is a document with duplicates."),
            Document("This is just to check that a word is not present.")
        ]
        term_document_matrix = TermDocumentMatrix(documents)
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("document"),
            log(3 / 3))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("not_present"),
            log(3 / 1))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("just"),
            log(3 / 2))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("is"),
            log(3 / 4))
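
These expected values imply add-one smoothing of the document frequency: "document" appears in two of the three documents, giving log(3 / (2 + 1)) = log(3 / 3) = 0, while "is", which appears in all three, gives log(3 / 4). A sketch under that assumption, where the words attribute is again an assumed name:

from math import log

def get_inverse_document_frequency(documents, term):
    """
    Assumed formula: log(N / (df + 1)), where df is the number of
    documents that contain the term at least once.
    """
    df = sum(1 for document in documents if term in document.words)
    return log(len(documents) / (df + 1))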
Example No. 30
    def test_cluster_with_several_vectors_copy(self):
        """
        Test that when creating a cluster with several vectors, a copy is created.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
        copy = list(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)
        self.assertEqual(copy, v)
        self.assertEqual(2, len(v))