def test_remove_documents():
    """Removing documents by document_id shrinks the feature matrix,
    the metadata db and the filename list by the same count, keeps the
    remaining document_ids in order, and leaves internal ids contiguous.
    """
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    # snapshot the state before removal
    features_before = fe._load_features(uuid)
    db_before = fe.db_.data
    filenames_before = fe.filenames_
    n_before = len(fe.filenames_)

    # build removal payload from the on-disk document index
    index_df = DocumentIndex.from_folder(data_dir).data
    doc_records = index_df[['document_id']].to_dict(orient='records')
    fe.remove([doc_records[2], doc_records[4]])

    # every view of the dataset must have shrunk by exactly two
    features_after = fe._load_features(uuid)
    assert features_after.shape[0] == features_before.shape[0] - 2
    assert fe.db_.data.shape[0] == db_before.shape[0] - 2
    assert len(fe.filenames_) == len(filenames_before) - 2

    db_after = fe.db_.data
    # survivors are exactly the non-removed rows, in original order
    assert_equal(db_before.iloc[[0, 1, 3, 5]]['document_id'].values,
                 db_after['document_id'].values)
    # check that internal id is contiguous
    assert (np.diff(db_after.internal_id.values) == 1).all()

    # check the number of samples is consistent
    # (drop cached params so n_samples_ is recomputed from disk)
    del fe._pars
    assert fe.n_samples_ == n_before - 2

    fe.delete()
def test_lsi_remove_documents():
    """Removing documents from the parent vectorizer also drops the
    corresponding rows from the derived LSI feature matrix.
    """
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    # fit an LSI model on top of the ingested features
    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    lsi_before = lsi._load_features()

    # remove two documents via the parent vectorizer
    index_df = DocumentIndex.from_folder(data_dir).data
    doc_records = index_df[['document_id']].to_dict(orient='records')
    fe.remove([doc_records[2], doc_records[4]])

    # the LSI projection must reflect the removal
    lsi_after = lsi._load_features()
    assert lsi_after.shape[0] == lsi_before.shape[0] - 2