def test_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()
def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
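
def test_dataset_definition_format():
    # Illustrative sketch, not part of the original test suite: it only
    # documents the shape of the ``dataset_definition`` records consumed by
    # FeatureVectorizer.append / .remove in the two tests above.
    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    # a list of per-document dicts, e.g. {'file_path': ..., 'document_id': ...}
    assert isinstance(dataset_definition, list)
    assert {'file_path', 'document_id'} <= set(dataset_definition[0])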
def test_ingestion_pickling():
    from sklearn.externals import joblib

    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    db2 = joblib.load(fname)
    os.remove(fname)
def test_search_not_found():
    dbi = DocumentIndex.from_folder(data_dir)

    query = pd.DataFrame([{'file_path': "DOES_NOT_EXISTS"},
                          {'file_path': "0.7.6.28637.txt"}])
    with pytest.raises(NotFound):
        sres = dbi.search(query)
def test_ingestion_base_dir():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data
    assert data_dir_res == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values,
                       ['file_path', 'internal_id', 'document_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    assert_array_equal(
        [os.path.normpath(os.path.join(data_dir_res, el)) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])
def test_search_2fields():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 3},
                          {'internal_id': 1},
                          {'internal_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure that if we have some additional field,
    # we still use the internal_id
    query = pd.DataFrame([{'internal_id': 1, 'a': 2},
                          {'internal_id': 2, 'b': 4},
                          {'internal_id': 1, 'a': 3}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(
        sorted(sres.columns),
        sorted(['internal_id', 'file_path', 'document_id', 'a', 'b']))

    query = pd.DataFrame([{'file_path': "0.7.6.28637.txt"},
                          {'file_path': "0.7.47.117435.txt"}])
    sres = dbi.search(query)
    query_res = [dbi.data.file_path.values.tolist().index(el)
                 for el in query.file_path.values]
    assert_array_equal(query_res, sres.internal_id)
    del dbi.data['file_path']
def test_lsi_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2
def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] * 2
    # the appended documents are copies of the originals, so their LSI
    # features must match the first batch
    assert_equal(X_lsi_new[:n_samples], X_lsi_new[n_samples:])
def ingest(self, data_dir=None, file_pattern='.*', dir_pattern='.*',
           dataset_definition=None, vectorize=True,
           document_id_generator='indexed_file_path'):
    """Perform data ingestion

    Parameters
    ----------
    data_dir : str
        path to the data directory (used only if metadata not provided),
        default: None
    dataset_definition : list of dicts
        a list of dictionaries with keys
        ['file_path', 'document_id', 'rendition_id']
        describing the data ingestion (this overwrites data_dir)
    vectorize : bool (default: True)
    """
    dsid_dir = self.cache_dir / self.dsid
    if (dsid_dir / 'db').exists():
        raise ValueError('Dataset {} already vectorized!'
                         .format(self.dsid))
    db_list = list(sorted(dsid_dir.glob('db*')))
    if len(db_list) == 0:
        internal_id_offset = -1
    elif len(db_list) >= 1:
        internal_id_offset = int(db_list[-1].name[3:])

    pars = self.pars_
    if pars.get('column_ids', None) is not None:
        if dataset_definition is None:
            raise ValueError("CSV files can only be provided using "
                             "the `dataset_definition` parameter")
        if len(dataset_definition) > 1:
            raise ValueError("Only one CSV can be provided at a time")
        file_path = dataset_definition[0]['file_path']
        X = pd.read_csv(file_path, sep=pars['column_separator'],
                        header=None)
        dataset_definition = [
            {'file_path': f"{file_path}:{idx}", 'document_id': idx}
            for idx in range(len(X))]
        db = DocumentIndex.from_list(
            dataset_definition, data_dir, internal_id_offset + 1,
            dsid_dir, document_id_generator=document_id_generator)
    elif dataset_definition is not None:
        db = DocumentIndex.from_list(
            dataset_definition, data_dir, internal_id_offset + 1,
            dsid_dir, document_id_generator=document_id_generator)
    elif data_dir is not None:
        db = DocumentIndex.from_folder(
            data_dir, file_pattern, dir_pattern,
            internal_id_offset + 1,
            document_id_generator=document_id_generator)
    else:
        db = None

    if db is not None:
        data_dir = db.data_dir
        batch_suffix = '.{:09}'.format(db.data.internal_id.iloc[-1])

        self._filenames = db.data.file_path.values.tolist()
        del db.data['file_path']

        db.data.to_pickle(str(dsid_dir / ('db' + batch_suffix)))
        with (dsid_dir / ('filenames' + batch_suffix)).open('wb') as fh:
            pickle.dump(self._filenames, fh)
        self._db = db

    if vectorize:
        db_list = list(sorted(dsid_dir.glob('db*')))
        filenames_list = list(sorted(dsid_dir.glob('filenames*')))
        if len(db_list) == 0:
            raise ValueError('No ingested files found!')
        if len(db_list) == 1:
            with filenames_list[0].open('rb') as fh:
                filenames_concat = pickle.load(fh)
        elif len(db_list) >= 2:
            # accumulate different batches into a single filenames file
            filenames_concat = []
            for fname in filenames_list:
                with fname.open('rb') as fh:
                    filenames_concat += pickle.load(fh)

        if self.pars_['data_dir'] is None:
            data_dir = DocumentIndex._detect_data_dir(filenames_concat)
            self._pars['data_dir'] = data_dir
        else:
            data_dir = self._pars['data_dir']

        self._filenames = [os.path.relpath(el, data_dir)
                           for el in filenames_concat]

        with (dsid_dir / 'filenames').open('wb') as fh:
            pickle.dump(self._filenames, fh)
        for fname in filenames_list:
            fname.unlink()

        # save databases
        if len(db_list) == 1:
            db_list[0].rename(dsid_dir / 'db')
            self.db_.filenames_ = self._filenames
            self.db_.data['file_path'] = self._filenames
        elif len(db_list) >= 2:
            db_concat = []
            for fname in db_list:
                db_concat.append(pd.read_pickle(str(fname)))
            db_new = pd.concat(db_concat, axis=0)
            db_new.filenames_ = self._filenames
            db_new.set_index('internal_id', drop=False, inplace=True)
            self._db = DocumentIndex(data_dir, db_new)
            if 'file_path' in db_new.columns:
                del db_new['file_path']
            db_new.to_pickle(str(dsid_dir / 'db'))

        # save parameters
        self._pars['n_samples'] = len(self._filenames)
        self._pars['data_dir'] = data_dir
        with (dsid_dir / 'pars').open('wb') as fh:
            pickle.dump(self._pars, fh)

        self.transform()

        if (dsid_dir / 'raw').exists():
            shutil.rmtree(str(dsid_dir / 'raw'))

    if db is None and not vectorize:
        raise ValueError('At least one of data_dir, dataset_definition, '
                         'vectorize parameters must be provided!')
    return
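
# Usage sketch for ``ingest`` (illustrative only; the cache directory, file
# paths and document ids below are placeholders, not values used by the
# library):
#
#     fe = FeatureVectorizer(cache_dir='/tmp/fd_cache', mode='w')
#     uuid = fe.setup()
#     # (a) ingest every file found in a folder
#     fe.ingest(data_dir='/path/to/documents')
#     # (b) or, alternatively, ingest an explicit list of documents
#     fe.ingest(dataset_definition=[
#         {'file_path': '/path/to/documents/doc_0.txt', 'document_id': 0},
#         {'file_path': '/path/to/documents/doc_1.txt', 'document_id': 1},
#     ])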
def load_dataset(name='20_newsgroups_3categories', cache_dir='/tmp',
                 verbose=False, verify_checksum=False,
                 document_id_generation='squared',
                 categories=None):
    """Download a benchmark dataset.

    The currently supported datasets are listed below,

    1. TREC 2009 legal collection

       - `treclegal09_2k_subset`  :   2 400 documents,   2 MB
       - `treclegal09_20k_subset` :  20 000 documents,  30 MB
       - `treclegal09_37k_subset` :  37 000 documents,  55 MB
       - `treclegal09`            : 700 000 documents, 1.2 GB

       The ground truth files for categorization are adapted from TAR Toolkit.

    2. Fedora mailing list (2009-2009)

       - `fedora_ml_3k_subset`

    3. The 20 newsgroups dataset

       - `20_newsgroups_3categories`: only the ['comp.graphics',
         'rec.sport.baseball', 'sci.space'] categories

    If you encounter any issues downloading data with this function, you can
    also manually download and extract the required dataset to ``cache_dir``
    (the download url is ``http://r0h.eu/d/<name>.tar.gz``), then re-run this
    function to get the required metadata.

    Parameters
    ----------
    name : str, default='20_newsgroups_3categories'
        the name of the dataset file to load
    cache_dir : str, default='/tmp'
        root directory where to save the download
    verbose : bool, default=False
        print download progress
    verify_checksum : bool, default=False
        verify the checksum of the downloaded archive
    document_id_generation : str
        specifies how the document_id is computed from internal_id;
        must be one of ``['identity', 'squared']``, ``default='squared'``
        (``'identity'`` means ``document_id = internal_id``)
    categories : list of str
        select a subset of the dataset categories;
        by default (``None``) all categories are kept

    Returns
    -------
    metadata : dict
        a dictionary containing metadata corresponding to the dataset
    training_set : {list, None}
        a list of dictionaries for the training set
    test_set : list
        a list of dictionaries for the test set
    """
    from freediscovery.engine.ingestion import DocumentIndex
    from freediscovery.io import parse_ground_truth_file

    if name not in IR_DATASETS:
        raise ValueError('Dataset name {} not known!'.format(name))

    valid_fields = ['document_id', 'internal_id', 'file_path', 'category']

    has_categories = '20_newsgroups_' in name or 'treclegal09' in name

    # make sure we don't have "ediscovery_cache" in the path
    cache_dir = _normalize_cachedir(cache_dir)
    cache_dir = os.path.dirname(cache_dir)
    outdir = os.path.join(cache_dir, name)

    fname = outdir
    db = IR_DATASETS[name]

    if '20_newsgroups_' in name:
        if db['url'].endswith('.pkl.xz'):
            import lzma
            fname = name + '.pkl.xz'
            opener = lzma.open
        else:
            fname = name + '.pkl'
            opener = open
        with opener(os.path.join(INTERNAL_DATA_DIR, fname), 'rb') as fh:
            twenty_news = pickle.load(fh)

    # Download the dataset if it doesn't exist
    if not os.path.exists(outdir):
        if '20_newsgroups_' in name:
            os.mkdir(outdir)
            for idx, doc in enumerate(twenty_news.data):
                with open(os.path.join(outdir, '{:05}.txt'.format(idx)), 'wt') as fh:  # noqa
                    fh.write(doc)
        else:
            outdir = _get_file(str(fname), db['url'], extract=True,
                               cache_dir=str(cache_dir))
            print('Downloaded {} dataset to {}'.format(name, outdir))

    if 'treclegal09' in name or 'fedora_ml' in name:
        data_dir = os.path.join(outdir, 'data')
    else:
        data_dir = outdir
    md = {'data_dir': str(data_dir), 'name': name}

    di = DocumentIndex.from_folder(str(data_dir))
    di._make_relative_paths()

    training_set = None

    if 'treclegal09' in name:
        negative_files, positive_files = _load_erdm_ground_truth(outdir)
        ground_truth_file = os.path.join(outdir, "ground_truth_file.txt")
        gt = parse_ground_truth_file(str(ground_truth_file))
        res = di.search(gt, drop=False)
        di.data['category'] = res.is_relevant
        di.data['category'] = di.data['category'].apply(
            lambda x: 'positive' if x == 1 else 'negative')
        di.data['is_train'] = False
        res = di.search(
            pd.DataFrame({'file_path': positive_files + negative_files}))
        di.data.loc[res.internal_id.values, 'is_train'] = True
    elif '20_newsgroups_' in name:
        di.data['category'] = np.array(
            twenty_news.target_names)[twenty_news.target]  # noqa
        di.data['is_train'] = ['-train' in el
                               for el in twenty_news.filenames]

    if categories is not None and has_categories:
        mask = di.data.category.isin(categories)
        di.data = di.data[mask]
        di.data['internal_id'] = np.arange(len(di.data['internal_id']))
        di.data.set_index('internal_id', drop=False, inplace=True)

    di.data['document_id'] = _compute_document_id(di.data['internal_id'],
                                                  document_id_generation)
    di.data = di.data.astype('object')

    if has_categories:
        mask = di.data['is_train']
        training_set = di.render_dict(di.data[mask], return_file_path=True)
        training_set = filter_dict(training_set, valid_fields)
        if name == '20_newsgroups_3categories':
            # make a smaller training set
            random.seed(999998)
            training_set = random.sample(
                training_set,
                min(len(training_set), di.data.shape[0] // 5))

    dataset = di.render_dict(return_file_path=True)
    dataset = filter_dict(dataset, valid_fields)

    return md, training_set, dataset
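
# Usage sketch for ``load_dataset`` (illustrative; '/tmp' is just the
# default cache location):
#
#     md, training_set, dataset = load_dataset('20_newsgroups_3categories',
#                                              cache_dir='/tmp')
#     md['data_dir']    # folder where the documents were extracted
#     dataset[0]        # e.g. {'document_id': ..., 'file_path': ..., 'category': ...}
#     if training_set is not None:
#         len(training_set)   # number of labeled training documents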