import os

import numpy as np
from numpy.testing import assert_equal

# FeatureVectorizer, _LSIWrapper, DocumentIndex, check_cache and data_dir are
# assumed to be provided by the surrounding FreeDiscovery test module.


def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    # re-ingest the same documents under document ids shifted by 10
    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal_id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent
    # (recomputed after invalidating the cached parameters)
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
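
# Same scenario for the LSI decomposition: appending documents to the parent
# vectorizer is expected to extend the stored LSI projection accordingly.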
def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    # append the same documents under document ids shifted by 10
    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] * 2
    # the first half of the updated LSI matrix must match the original
    # projection of the initially ingested documents
    assert_equal(X_lsi_new[:n_samples], X_lsi)
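
# For reference, a minimal sketch of the `dataset_definition` records both
# tests build before calling `FeatureVectorizer.append` (file names below are
# hypothetical; only the 'file_path' and 'document_id' keys are used here):
#
#     dataset_definition = [
#         {'file_path': '/path/to/data_dir/doc_01.txt', 'document_id': 10},
#         {'file_path': '/path/to/data_dir/doc_02.txt', 'document_id': 11},
#     ]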