def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    dsid = fe.setup(n_features=n_features, use_hashing=False,
                    stop_words='english', min_df=0.1, max_df=0.9)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')
    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, fe.filenames_, lsi.mid
def fd_setup():
    # same fixture as above, using the preprocess()/transform() API
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                  n_features=n_features, use_hashing=False,
                  stop_words='english', min_df=0.1, max_df=0.9)
    dsid, filenames = fe.transform()
    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, filenames, lsi.mid
def test_lsi():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_components = 5

    fe = FeatureVectorizer(cache_dir=cache_dir)
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt')
    uuid, _ = fe.transform()

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, _ = lsi.fit_transform(n_components=n_components)
    assert lsi_res.components_.shape == (n_components, fe.n_features_)
    assert lsi._load_pars() is not None
    lsi._load_model()

    # test pipeline
    lsi.list_models()
    lsi.delete()
basename = os.path.dirname(__file__)
cache_dir = check_cache()
EPSILON = 1e-4
data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

fe = FeatureVectorizer(cache_dir=cache_dir)
vect_uuid = fe.setup()
fe.ingest(data_dir, file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid)
lsi.fit_transform(n_components=6)

ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

_test_cases = itertools.product(
        [False, True],
        ["LinearSVC", "LogisticRegression", 'xgboost',
         "NearestNeighbor", "NearestCentroid"],
        [None, 'fast'])
# 'MLPClassifier' and 'ensemble-stacking' are not supported in production
# at the moment
_test_cases = [x for x in _test_cases
               if not (x[1].startswith("Nearest") and x[2])]
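# NOTE: '_test_cases' above is presumably consumed by a parametrized
# categorization test elsewhere in this module. A minimal sketch of that
# hook-up (the parameter names 'use_lsi, method, cv' and the test name are
# assumptions, not taken from this file):
#
# @pytest.mark.parametrize('use_lsi, method, cv', _test_cases)
# def test_categorization(use_lsi, method, cv):
#     ...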
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing
    cache_dir = check_cache()

    n_features = 20000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(
            os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)
        try:
            coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                                     method=method)
        except OptionalDependencyMissing:
            raise SkipTest
        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)

            htree = cat._get_htree(cat.pipeline.data)
            terms = cat.compute_labels(n_top_words=10)
        else:
            # hashed features are not supported for clustering
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError('unknown method: {}'.format(method))
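# A sketch of how this test is presumably parametrized (the decorator below
# is an assumption based on the argument names, not taken from this file):
#
# @pytest.mark.parametrize('use_hashing, use_lsi, method',
#                          itertools.product([True, False],
#                                            [True, False],
#                                            ['Categorization',
#                                             'DuplicateDetection',
#                                             'Clustering']))
# def test_features_hashing(use_hashing, use_lsi, method):
#     ...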