Python get_docs_categories_semiotic 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: scattertext.test.test_semioticSquare

메소드/함수: get_docs_categories_semiotic

hotexamples.com에서의 예제들: 5

Python get_docs_categories_semiotic - 5개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python scattertext.test.test_semioticSquare.get_docs_categories_semiotic의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질 개선에 도움이 됩니다.

예제 #1

파일 보기

파일: test_TermDocMat.py 프로젝트: xuezhizeng/scattertext

	def test_keep_only_these_categories(self):
		"""Check that ``keep_only_these_categories`` restricts a corpus to the named categories.

		Verifies that the filtered corpus reports exactly the requested
		categories, that it has fewer terms than the full corpus, that an
		unknown category name raises ``AssertionError`` by default, and that
		the same call does not raise when ``True`` is passed as the second
		positional argument.
		"""
		# Build the frame straight from the (categories, docs) pair rather than
		# going through ``pd.np``, which was deprecated in pandas 1.0 and
		# removed in pandas 2.0.
		categories, docs = get_docs_categories_semiotic()
		df = pd.DataFrame({'category': categories, 'text': docs})
		corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
		hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
		self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
		self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
		# A category that is not in the corpus is rejected by default.
		with self.assertRaises(AssertionError):
			corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
		# With the second argument set to True the call is expected to succeed
		# (presumably an "ignore absent categories" flag -- confirm against the API).
		corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)

예제 #2

파일 보기

파일: test_TermDocMat.py 프로젝트: JasonKessler/scattertext

 def test_keep_only_these_categories(self):
     """Check that ``keep_only_these_categories`` restricts a corpus to the named categories.

     Verifies that the filtered corpus reports exactly the requested
     categories, that it has fewer terms than the full corpus, that an
     unknown category name raises ``AssertionError`` by default, and that
     the same call does not raise when ``True`` is passed as the second
     positional argument.
     """
     # Build the frame straight from the (categories, docs) pair rather than
     # going through ``pd.np``, which was deprecated in pandas 1.0 and
     # removed in pandas 2.0.
     categories, docs = get_docs_categories_semiotic()
     df = pd.DataFrame({'category': categories, 'text': docs})
     corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
     hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
     self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
     self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
     # A category that is not in the corpus is rejected by default.
     with self.assertRaises(AssertionError):
         corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
     # With the second argument set to True the call is expected to succeed
     # (presumably an "ignore absent categories" flag -- confirm against the API).
     corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)

예제 #3

파일 보기

파일: test_termDocMatrixFromScikit.py 프로젝트: JasonKessler/scattertext

	def test_build(self):
		"""Build a term-document matrix from scikit-learn counts and check its contents.

		Vectorizes the semiotic-square documents with ``CountVectorizer``,
		feeds the counts, integer labels, vocabulary and category names to
		``TermDocMatrixFromScikit``, then asserts the first two category names
		and the five terms with the highest scaled F-score for ``'hamlet'``.
		"""
		from sklearn.feature_extraction.text import CountVectorizer
		categories, docs = get_docs_categories_semiotic()
		# Map each category name to an integer id in order of first appearance.
		category_index = IndexStore()
		labels = np.array([category_index.getidx(name) for name in categories])
		vectorizer = CountVectorizer()
		counts = vectorizer.fit_transform(docs)
		builder = TermDocMatrixFromScikit(
			X=counts,
			y=labels,
			feature_vocabulary=vectorizer.vocabulary_,
			category_names=category_index.values())
		tdm = builder.build()
		leading_categories = tdm.get_categories()[:2]
		self.assertEqual(leading_categories, ['hamlet', 'jay-z/r. kelly'])
		# Rank terms by their scaled F-score for the 'hamlet' category.
		scored = tdm.get_term_freq_df().assign(score=tdm.get_scaled_f_scores('hamlet'))
		ranked = scored.sort_values(by='score', ascending=False)
		top_terms = ranked.index.tolist()[:5]
		self.assertEqual(top_terms,
		                 ['that', 'march', 'did', 'majesty', 'sometimes'])

예제 #4

파일 보기

    def test_remove_categories(self):
        """Check that ``remove_categories`` matches a corpus built without that category.

        Removes ``'swift'`` from the full corpus and compares the result with
        a corpus constructed from the same frame with the ``'swift'`` rows
        filtered out: label vector, matrix shape, term set and texts must agree.
        """
        # Build the frame straight from the (categories, docs) pair rather than
        # going through ``pd.np``, which was deprecated in pandas 1.0 and
        # removed in pandas 2.0.
        categories, docs = get_docs_categories_semiotic()
        df = pd.DataFrame({'category': categories, 'text': docs})
        corpus = CorpusFromPandas(df, 'category', 'text',
                                  nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()
        # Look the category index up once instead of once per label.
        swift_idx = corpus.get_categories().index('swift')
        np.testing.assert_equal([i for i in corpus._y if i != swift_idx],
                                swiftless._y)
        # One label per document row.
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())

예제 #5

파일 보기

파일: test_TermDocMat.py 프로젝트: JasonKessler/scattertext

    def test_remove_categories(self):
        """Check that ``remove_categories`` matches a corpus built without that category.

        Removes ``'swift'`` from the full corpus and compares the result with
        a corpus constructed from the same frame with the ``'swift'`` rows
        filtered out: label vector, matrix shape, term set and texts must agree.
        """
        # Build the frame straight from the (categories, docs) pair rather than
        # going through ``pd.np``, which was deprecated in pandas 1.0 and
        # removed in pandas 2.0.
        categories, docs = get_docs_categories_semiotic()
        df = pd.DataFrame({'category': categories, 'text': docs})
        corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()
        # Look the category index up once instead of once per label.
        swift_idx = corpus.get_categories().index('swift')
        np.testing.assert_equal(
            [i for i in corpus._y if i != swift_idx],
            swiftless._y
        )
        # One label per document row.
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())