def test_save_load():
    """Round-trip a corpus through ``save`` and ``Corpus.load``.

    Documents and dictionary are written through open binary file objects,
    the metadata is written by filename.  The reloaded corpus must match
    the original in documents, metadata and dictionary.  All temporary
    files are removed afterwards, whether or not the assertions pass.
    """
    c, _ = mock_corpus()
    fd, filename = tempfile.mkstemp()
    dict_fd, dict_filename = tempfile.mkstemp()
    metadata_fd, metadata_filename = tempfile.mkstemp()
    # The metadata file is only ever addressed by name, so release the
    # descriptor opened by mkstemp immediately instead of leaking it.
    os.close(metadata_fd)
    try:
        # The context managers close both files before they are re-read
        # by name, even if ``save`` raises.
        with os.fdopen(fd, 'wb') as f, os.fdopen(dict_fd, 'wb') as dict_f:
            c.save(documents_file=f,
                   dictionary_file=dict_f,
                   metadata_filename=metadata_filename)

        new_c = Corpus.load(
            documents_file=filename,
            dictionary_file=dict_filename,
            metadata_filename=metadata_filename)

        assert_equals(c.documents, new_c.documents)

        # ``all(frame)`` iterates over column labels, not cell values, so
        # the comparison has to be reduced with ``.all().all()``.  Missing
        # values are replaced by a sentinel first because NaN != NaN.
        expected_metadata = c.metadata.fillna(-1.0)
        loaded_metadata = new_c.metadata.fillna(-1.0)
        assert_true((expected_metadata == loaded_metadata).all().all())

        assert_equals(c.dic, new_c.dic)
    finally:
        os.remove(filename)
        os.remove(dict_filename)
        os.remove(metadata_filename)
def test_metadata():
    """Check the type, length and a few cell values of the mock metadata."""
    corpus, _ = mock_corpus()

    assert_true(isinstance(corpus.metadata, pd.DataFrame))
    assert_equals(2, len(corpus.metadata))

    # First row carries concrete values for both columns.
    users = corpus.metadata['user']
    assert_equals('this', users[0])
    ages = corpus.metadata['age']
    assert_equals(10, ages[0])

    # The second row's age is expected to be missing (NaN).
    assert_true(np.isnan(ages[1]))
def test_add():
    """Adding a two-word document grows samples by one and features by two."""
    corpus, _ = mock_corpus()
    samples_before, features_before = corpus.num_samples, corpus.num_features

    corpus.add_document(['new', 'words'])

    assert_equals(samples_before + 1, corpus.num_samples)
    assert_equals(features_before + 2, corpus.num_features)
def test_index():
    """``with_index(1)`` yields a one-sample corpus holding the second document."""
    corpus, documents = mock_corpus()
    selected = corpus.with_index(1)

    # The original keeps both samples; the selection has exactly one.
    assert_equals(2, corpus.num_samples)
    assert_equals(1, selected.num_samples)

    # dictionary does not change
    for subject in (corpus, selected):
        assert_equals(3, subject.num_features)

    assert_array_equal(documents[1], selected.documents[0])
def test_merge():
    """Merging a second mock corpus doubles documents and metadata rows.

    The feature count stays at 3 and both halves of the merged metadata
    must equal the metadata the corpus had before merging.
    """
    corpus, documents = mock_corpus()
    other, _ = mock_corpus()
    # NaNs are replaced with -1.0 on every frame before it is compared.
    metadata_before = corpus.metadata.fillna(-1.0)

    corpus.merge(other)

    assert_equals(4, corpus.num_samples)
    assert_array_equal(documents + documents, corpus.documents)
    assert_equals(3, corpus.num_features)
    assert_equals((4, 3), corpus.sparse_matrix().shape)
    assert_equals(4, len(corpus.metadata))

    first_half = corpus.metadata[:2].fillna(-1.0)
    assert_array_equal(metadata_before, first_half)

    second_half = corpus.metadata[2:4].fillna(-1.0)
    # reset index to align with old metadata
    second_half.index = [0, 1]
    assert_array_equal(metadata_before, second_half)
def test_init_nonempty():
    """A freshly built mock corpus exposes its documents, dictionary and sizes."""
    corpus, documents = mock_corpus()

    assert_equals(documents, corpus.documents)
    assert_equals(gensim.corpora.Dictionary(documents), corpus.dic)
    assert_equals(3, corpus.num_features)
    assert_equals(2, corpus.num_samples)

    # Collect the word behind every feature id; order is irrelevant.
    vocabulary = frozenset(
        corpus.word(feature_id) for feature_id in range(corpus.num_features))
    assert_equals(vocabulary, frozenset(['a', 'la', 'ca']))
def test_sparse_matrix():
    """The sparse matrix has the expected shape and entries.

    Adding an empty document appends a row without altering the columns
    or the entries that are checked.
    """
    def check_matrix(matrix, expected_shape):
        # The same three entries are verified before and after the add.
        assert_equals(expected_shape, matrix.shape)
        assert_equals(2, matrix[0, 0])
        assert_equals(0, matrix[1, 0])
        assert_equals(1, matrix[1, 2])

    corpus, _ = mock_corpus()
    check_matrix(corpus.sparse_matrix(), (2, 3))

    corpus.add_document([])
    check_matrix(corpus.sparse_matrix(), (3, 3))
def test_save_load_dictionary():
    """Round-trip only the dictionary through a temporary file.

    The dictionary is written via ``save_dictionary`` to an open binary
    file object, loaded into a fresh ``Corpus`` by filename and compared
    with the original.  The temporary file is always removed.
    """
    c, _ = mock_corpus()
    dict_fd, dict_filename = tempfile.mkstemp()
    try:
        # The context manager closes the file before it is re-read by
        # name, even if saving raises.
        with os.fdopen(dict_fd, 'wb') as dict_f:
            c.save_dictionary(dict_f)

        new_c = Corpus()
        new_c.load_dictionary(dict_filename)
        assert_equals(c.dic, new_c.dic)
    finally:
        os.remove(dict_filename)
def test_mask():
    """``with_mask`` keeps exactly the documents whose mask entry is True."""
    corpus, documents = mock_corpus()

    first_only = corpus.with_mask([True, False])
    assert_equals(2, corpus.num_samples)
    assert_equals(1, first_only.num_samples)
    # dictionary does not change
    assert_equals(3, corpus.num_features)
    assert_equals(3, first_only.num_features)
    assert_array_equal(documents[0], first_only.documents[0])

    second_only = corpus.with_mask([False, True])
    assert_equals(1, second_only.num_samples)
    assert_array_equal(documents[1], second_only.documents[0])

    everything = corpus.with_mask([True, True])
    assert_equals(2, everything.num_samples)

    nothing = corpus.with_mask([False, False])
    assert_equals(0, nothing.num_samples)
    # dictionary does not change
    assert_equals(3, nothing.num_features)