示例#1
0
    def test_asserting_errors(self):
        """Invalid array sizes and invalid text features must raise ValueError."""
        corpus = Corpus.from_file('book-excerpts')

        # X stacked on top of itself has twice as many rows as Y, metas and W.
        doubled_x = np.vstack((corpus.X, corpus.X))
        with self.assertRaises(ValueError):
            Corpus.from_numpy(
                corpus.domain,
                doubled_x,
                corpus.Y,
                corpus.metas,
                corpus.W,
                corpus.text_features,
            )

        # A newly created variable is expected to be rejected ...
        with self.assertRaises(ValueError):
            corpus.set_text_features([StringVariable('foobar')])

        # ... and so is the same meta variable listed twice.
        with self.assertRaises(ValueError):
            corpus.set_text_features(
                [corpus.domain.metas[0], corpus.domain.metas[0]]
            )
示例#2
0
def main():
    """Show the word cloud widget fed with a demo topic table and corpus."""
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    lyrics = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    # '~' stands for a space inside a single entry; the array is one column.
    entries = [entry.replace('~', ' ') for entry in lyrics.split()]
    words = np.array(entries, dtype=object, ndmin=2).T
    n_words = len(words)
    weights = np.random.random((n_words, 1))

    # Metas are built as (word, weight) column pairs, one pair per column
    # of `weights`.
    meta_columns = np.zeros((n_words, 0))
    meta_vars = []
    for index, weight_column in enumerate(weights.T):
        meta_columns = np.column_stack((meta_columns, words, weight_column))
        meta_vars = meta_vars + [StringVariable('Topic' + str(index)),
                                 ContinuousVariable('weights')]
    topics = Table.from_numpy(Domain([], metas=meta_vars),
                              X=np.zeros((n_words, 0)),
                              metas=meta_columns)

    app = QtGui.QApplication([''])
    widget = OWWordCloud()
    widget.on_topics_change(topics)

    # A one-document corpus whose text is all the words joined by spaces.
    text_domain = Domain([], metas=[StringVariable('text')])
    joined_text = ' '.join(words.flat)
    corpus = Corpus.from_numpy(text_domain,
                               X=np.zeros((1, 0)),
                               metas=np.array([[joined_text]]))
    widget.on_corpus_change(corpus)
    widget.show()
    app.exec()
示例#3
0
def main():
    """Run the word cloud widget on sample topics and a one-document corpus."""
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    raw = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    # Each whitespace-separated chunk is one entry; '~' becomes a space.
    words = np.array([chunk.replace('~', ' ') for chunk in raw.split()],
                     dtype=object, ndmin=2).T
    row_count = len(words)
    weights = np.random.random((row_count, 1))

    # Append a word column and a weight column for every column of weights.
    topic_metas = np.zeros((row_count, 0))
    topic_vars = []
    for position, weight_values in enumerate(weights.T):
        topic_metas = np.column_stack((topic_metas, words, weight_values))
        topic_vars = topic_vars + [StringVariable('Topic' + str(position)),
                                   ContinuousVariable('weights')]
    topic_domain = Domain([], metas=topic_vars)
    topic_table = Table.from_numpy(topic_domain,
                                   X=np.zeros((row_count, 0)),
                                   metas=topic_metas)

    app = QtGui.QApplication([''])
    cloud = OWWordCloud()
    cloud.on_topics_change(topic_table)

    corpus_domain = Domain([], metas=[StringVariable('text')])
    corpus_metas = np.array([[' '.join(words.flat)]])
    corpus = Corpus.from_numpy(corpus_domain,
                               X=np.zeros((1, 0)),
                               metas=corpus_metas)
    cloud.on_corpus_change(corpus)
    cloud.show()
    app.exec()
示例#4
0
    def test_set_title_from_domain(self):
        """
        Titles of a corpus built with from_numpy follow the domain: default
        "Document N" titles without a title-flagged variable, and the
        flagged column's values once the "title" attribute is set.
        """
        meta_vars = [StringVariable("title"), StringVariable("a")]
        domain = Domain([], metas=meta_vars)
        rows = [["title1", "a"], ["title2", "b"]]

        def build_corpus():
            # A fresh metas array is created for every corpus.
            return Corpus.from_numpy(
                domain, X=np.empty((2, 0)), metas=np.array(rows)
            )

        untitled = build_corpus()
        assert_array_equal(["Document 1", "Document 2"], untitled.titles)

        domain["title"].attributes["title"] = True
        titled = build_corpus()
        assert_array_equal(["title1", "title2"], titled.titles)
示例#5
0
    def test_titles_from_rows(self):
        """Selecting rows 0 and 2 must yield the titles Document 1 and 3."""
        meta_vars = [StringVariable("title"), StringVariable("a")]
        meta_values = np.array(
            [["title1", "a"], ["title2", "b"], ["titles3", "c"]]
        )

        full_corpus = Corpus.from_numpy(
            Domain([], metas=meta_vars),
            X=np.empty((3, 0)),
            metas=meta_values,
        )
        selection = Corpus.from_table_rows(full_corpus, [0, 2])

        assert_array_equal(["Document 1", "Document 3"], selection.titles)
示例#6
0
    def test_init_preserve_shape_of_empty_x(self):
        """A sparse X without stored values must keep its shape in the corpus."""
        source = Corpus.from_file('book-excerpts')
        source_domain = source.domain
        # Same class variables and metas, but with one continuous attribute.
        attributes = (ContinuousVariable('c1'), )
        single_attr_domain = Domain(
            attributes, source_domain.class_vars, source_domain.metas
        )

        sparse_x = csr_matrix((len(source), 1))
        rebuilt = Corpus.from_numpy(
            single_attr_domain, X=sparse_x, Y=source.Y, metas=source.metas
        )

        self.assertEqual(sparse_x.nnz, 0)
        self.assertEqual(rebuilt.X.shape, sparse_x.shape)
示例#7
0
 def test_corpus_from_numpy(self):
     """from_numpy gives default titles, one text feature and no tokens."""
     meta_vars = [StringVariable("title"), StringVariable("a")]
     meta_values = np.array([["title1", "a"], ["title2", "b"]])
     corpus = Corpus.from_numpy(
         Domain([], metas=meta_vars),
         np.empty((2, 0)),
         metas=meta_values,
     )

     self.assertEqual(2, len(corpus))
     assert_array_equal(["Document 1", "Document 2"], corpus.titles)

     expected_text_features = [StringVariable("title")]
     self.assertListEqual(expected_text_features, corpus.text_features)

     # Nothing has been preprocessed yet.
     self.assertIsNone(corpus._tokens)
     self.assertListEqual([], corpus.used_preprocessor.preprocessors)
示例#8
0
 def _create_corpus(self) -> Corpus:
     """Build a Corpus from the items in ``self._text_data``.

     Each item contributes its NFC-normalized ``name``, ``path`` and
     ``content`` as string metas (plus ``doc_id`` in an "utterance" column
     when ``self.is_conllu`` is set). The item categories become the class
     variable only when more than one distinct category is present.

     Returns:
         The new corpus, or None when there are no items.
     """
     if self.is_conllu:
         names = ["name", "path", "utterance", "content"]
     else:
         names = ["name", "path", "content"]

     text_categories = list(set(t.category for t in self._text_data))
     values = list(set(text_categories))
     category_var = DiscreteVariable.make("category", values=values)

     rows = []
     category_codes = []
     for item in self._text_data:
         # Some characters are stored decomposed (č as the letter c plus a
         # separate caron); NFC normalization rewrites them in precomposed
         # form (č as the single code point 0x10D).
         # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
         name = normalize('NFC', item.name)
         path = normalize('NFC', item.path)
         content = normalize('NFC', item.content)
         if self.is_conllu:
             row = [name, path, normalize('NFC', item.doc_id), content]
         else:
             row = [name, path, content]
         rows.append(row)
         category_codes.append(category_var.to_val(item.category))

     if len(text_categories) > 1:
         category_data = np.array(category_codes)
     else:
         # With at most one category there is no class variable at all.
         category_var = []
         category_data = np.empty((len(rows), 0))

     meta_vars = [StringVariable.make(name) for name in names]
     domain = Domain([], category_var, meta_vars)
     domain["name"].attributes["title"] = True

     metas = np.array(rows, dtype=object)
     if not len(metas):
         return None
     return Corpus.from_numpy(domain,
                              X=np.empty((len(category_data), 0)),
                              Y=category_data,
                              metas=metas,
                              text_features=[domain.metas[-1]])
示例#9
0
def _corpus_from_records(records, includes_metadata):
    """Turn PubMed records into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): The fields to include; iterated as pairs
            whose first element is the field name.

    Returns:
        corpus: The output Corpus, with a discrete 'section' class variable
            and one meta variable per included field.
    """
    time_var = None
    meta_vars = []
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            # The date field is the only one stored as a time variable.
            variable = time_var = TimeVariable(field_name)
        else:
            variable = StringVariable.make(field_name)
            if field_name == PUBMED_FIELD_TITLE:
                variable.attributes["title"] = True
        meta_vars.append(variable)

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    # Distinct, truthy class values become the values of 'section'.
    section_values = [str(value) for value in set(filter(None, class_values))]
    section_var = DiscreteVariable('section', values=section_values)
    domain = Domain([], class_vars=[section_var], metas=meta_vars)

    encoded = [section_var.to_val(value) for value in class_values]
    Y = np.array(encoded)[:, None]

    return Corpus.from_numpy(domain=domain,
                             X=np.empty((len(Y), 0)),
                             Y=Y,
                             metas=meta_values)
示例#10
0
 def get_data(self):
     """Export the rows of this model as a two-meta Corpus.

     The first meta holds the cells of a row joined by single spaces, the
     second the title of the document taken from ``self.word_index``.
     The first meta is the corpus' only text feature.
     """
     concordance_var = StringVariable("Conc. {}".format(self.word))
     document_var = StringVariable("Document")
     domain = Domain([], metas=[concordance_var, document_var])

     texts = []
     documents = []
     for row in range(self.rowCount()):
         cells = [
             str(self.data(self.index(row, column)))
             for column in range(self.columnCount())
         ]
         texts.append([" ".join(cells)])
         documents.append([self.corpus.titles[self.word_index[row][0]]])

     if texts:
         conc = np.array(np.hstack((texts, documents)), dtype=object)
     else:
         # No rows: keep the two-column layout.
         conc = np.empty((0, 2))

     return Corpus.from_numpy(
         domain,
         X=np.empty((len(conc), 0)),
         metas=conc,
         text_features=[domain.metas[0]],
     )
示例#11
0
"""
Split every document of a corpus into one document per sentence.

input: Corpus preprocessed with Preprocess Text. Tokenizer is set to Sentences.
output: Corpus where sentences are now documents.
requires: Text add-on
"""

import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text.corpus import Corpus

# NOTE(review): `in_data` is not defined in this script; presumably it is
# supplied by the host that runs the script (see "input" above) - confirm.
tokens = in_data.tokens
# The first meta variable carrying a "title" attribute names the documents.
title = [i for i in in_data.domain.metas if "title" in i.attributes][0]
sentence_var = StringVariable('Sentences')
new_domain = Domain(attributes=[], metas=[sentence_var, title])

titles = []
content = []

# Every token of a document becomes its own row, labelled with the title of
# the document it came from.
for i, doc in enumerate(tokens):
    for t in doc:
        titles.append(in_data[i][title.name].value)
        content.append(t)

metas = np.column_stack((content, titles))
out_data = Corpus.from_numpy(domain=new_domain, X=np.empty((len(content), 0)),
                             metas=metas)
# Reuse the variable object that is part of the new domain rather than
# creating a second 'Sentences' variable.
out_data.set_text_features([sentence_var])
out_data.set_title_variable(title)