Example #1
def test_search_filenames(use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt',
              use_hashing=use_hashing)  # TODO unused (overwritten on the next line)
    uuid, filenames = fe.transform()

    assert_equal(fe._pars['filenames'], filenames)



    for low, high, step in [(0, 1, 1),
                            (0, 4, 1),
                            (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [filenames[idx] for idx in idx_slice]
        idx0 = fe.search(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    with pytest.raises(KeyError):
        fe.search(['DOES_NOT_EXIST.txt'])

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
Example #2
def test_feature_extraction_storage():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern='.*\d.txt')
    db = pd.read_pickle(os.path.join(cache_dir, 'ediscovery_cache',
                                     uuid, 'db'))
    assert 'file_path' not in db.columns
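Example #1 uses the older preprocess()/transform() API, while Example #2 uses the newer setup()/ingest() calls. As a reading aid, here is a minimal sketch of the workflow the two share; it assumes FeatureVectorizer, check_cache() and data_dir are imported/defined exactly as in these excerpts, and it is not taken verbatim from the test suite:

cache_dir = check_cache()                       # temporary cache directory
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.setup()                               # configure the vectorizer, returns a dataset id
fe.ingest(data_dir, file_pattern=r'.*\d.txt')   # scan data_dir and vectorize the matching files
X = fe._load_features(uuid)                     # sparse document-term matrix
filenames = fe.filenames_                       # ingested file names, in the same row order as X
fe.delete()                                     # remove the cached dataset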
Example #3
def fd_setup(**fe_options):
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features,
                    use_hashing=True,
                    stop_words='english',
                    **fe_options)
    fe.ingest(data_dir, file_pattern='.*\d.txt')
    return cache_dir, uuid, fe.filenames_, fe
Example #4
def test_search_filenames(use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    assert fe.db_ is not None

    for low, high, step in [(0, 1, 1),
                            (0, 4, 1),
                            (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [fe.filenames_[idx] for idx in idx_slice]
        idx0 = fe.db_._search_filenames(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    with pytest.raises(NotFound):
        fe.db_._search_filenames(['DOES_NOT_EXIST.txt'])

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
Example #5
def fd_setup(**fe_options):
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(
        data_dir,
        file_pattern='.*\d.txt',
        n_features=n_features,
        use_hashing=True,
        stop_words='english',
        **fe_options)  # TODO unused variable (overwritten on the next line)
    uuid, filenames = fe.transform()
    return cache_dir, uuid, filenames, fe
Example #6
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    dsid = fe.setup(n_features=n_features,
                    use_hashing=False,
                    stop_words='english',
                    min_df=0.1,
                    max_df=0.9)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, fe.filenames_, lsi.mid
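The setup fixtures above add an LSI (truncated SVD) step on top of the vectorized dataset. A condensed sketch, assuming cache_dir and dsid come from a FeatureVectorizer setup()/ingest() call as in Example #6:

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
lsi_res, exp_var = lsi.fit_transform(n_components=6)  # fitted model (exposes .components_) and explained variance
lsi_id = lsi.mid                                       # model id, passed as parent_id to downstream wrappers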
Example #7
def fd_setup():
    basename = os.path.dirname(__file__)

    cache_dir = check_cache()

    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir,
                         file_pattern='.*\d.txt',
                         n_features=n_features,
                         use_hashing=True,
                         stop_words='english')
    uuid, filenames = fe.transform()
    return cache_dir, uuid, filenames, fe
Example #8
def test_email_parsing():
    data_dir = os.path.join(basename, "..", "data",
                            "fedora-devel-list-2008-October")
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    email_md = fe.parse_email_headers()
    assert len(fe.filenames_) == len(email_md)

    fe.delete()
Example #9
def test_lsi():
    basename = os.path.dirname(__file__)

    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir,
                         file_pattern='.*\d.txt',
                         n_features=n_features)
    uuid, filenames = fe.transform()
    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = LSI(cache_dir=cache_dir, dsid=uuid)
    lsi_res, exp_var = lsi.transform(n_components=100)
    lsi_id = lsi.mid
    assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
    assert lsi.get_path(lsi_id) is not None
    assert lsi._load_pars(lsi_id) is not None
    lsi.load(lsi_id)

    mask = ground_truth.is_relevant.values == 1

    for accumulate in ['nearest-max', 'centroid-max']:
        #'nearest-diff', 'nearest-combine', 'stacking']:
        _, X_train, Y_train_val, Y_train, X_pred, Y_pred, ND_train = lsi.predict(
            ground_truth.index.values[mask],
            ground_truth.index.values[~mask],
            accumulate=accumulate)
        scores = classification_score(ground_truth.index.values,
                                      ground_truth.is_relevant.values, X_pred,
                                      Y_pred)
        #yield assert_allclose, scores['precision_score'], 1
        #yield assert_allclose, scores['recall_score'], 1

    lsi.list_models()

    lsi.delete()
Example #10
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir)
    dsid = fe.preprocess(
        data_dir,
        file_pattern='.*\d.txt',
        n_features=n_features,
        use_hashing=False,
        stop_words='english',
        min_df=0.1,
        max_df=0.9
    )  # TODO unused variable 'dsid' (overwritten on the next line)
    dsid, filenames = fe.transform()

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, filenames, lsi.mid
Example #11
def test_lsi():
    basename = os.path.dirname(__file__)

    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_components = 5

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt')
    uuid, filenames = fe.transform()

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(
        n_components=n_components)  # TODO exp_var is unused
    assert lsi_res.components_.shape == (n_components, fe.n_features_)
    assert lsi._load_pars() is not None
    lsi._load_model()

    # test pipeline

    lsi.list_models()
    lsi.delete()
Example #12
def test_lsi():
    basename = os.path.dirname(__file__)

    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(
        data_dir, file_pattern='.*\d.txt', n_features=n_features
    )  # TODO unused variable (overwritten on the next line)
    uuid, filenames = fe.transform()
    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = LSI(cache_dir=cache_dir, dsid=uuid)
    lsi_res, exp_var = lsi.transform(n_components=100)  # TODO unused variables
    lsi_id = lsi.mid
    assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
    assert lsi.get_path(lsi_id) is not None
    assert lsi._load_pars(lsi_id) is not None
    lsi.load(lsi_id)

    idx_gt = lsi.fe.search(ground_truth.index.values)
    idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

    for accumulate in ['nearest-max', 'centroid-max']:
        #'nearest-diff', 'nearest-combine', 'stacking']:
        _, Y_train, Y_pred, ND_train = lsi.predict(
            idx_gt, ground_truth.is_relevant.values, accumulate=accumulate)
        scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                      idx_all, Y_pred)
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.3)

    lsi.list_models()
    lsi.delete()
Example #13
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()

    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
Example #14
def test_feature_extraction_cyrillic(use_hashing):
    data_dir = os.path.join(basename, "..", "data", "ds_002", "raw")
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)

    filenames = fe.filenames_
    fe._filenames = None
    filenames2 = fe.filenames_

    assert_equal(filenames2, filenames)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2),\
        "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()
    fe.delete()
Example #15
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt',
            analyzer=analyzer, ngram_range=ngram_range, use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()
    fe.delete()
Example #16
def test_feature_extraction_nfeatures(n_features, use_idf, use_hashing):
    cache_dir = check_cache()

    use_hashing = (use_hashing == 'hashed')
    use_idf = (use_idf == 'IDF')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features,
                    use_idf=use_idf, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()

    assert res2.shape[1] == fe.n_features_

    fe.delete()
Example #17
def test_feature_extraction(analyzer, stop_words, ngram_range, use_idf, sublinear_tf, binary, use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt', n_features=n_features,
            analyzer=analyzer, stop_words=stop_words, ngram_range=ngram_range,
            use_idf=use_idf, binary=binary, use_hashing=use_hashing, sublinear_tf=sublinear_tf)  # TODO unused (overwritten on the next line)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()


    fe.delete()
Example #18
def test_feature_extraction_weighting(use_idf, sublinear_tf, binary,
                                      use_hashing):
    cache_dir = check_cache()

    use_idf = (use_idf == 'IDF')
    sublinear_tf = (sublinear_tf == 'sublinear TF')
    binary = (binary == 'binary')
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(use_idf=use_idf, binary=binary,
                    use_hashing=use_hashing, sublinear_tf=sublinear_tf)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()
    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
Example #19
def test_threading():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir=data_dir)
    fe.parse_email_headers()

    cat = _EmailThreadingWrapper(cache_dir=cache_dir, parent_id=uuid)

    tree = cat.thread()
    cat.get_params()

    tree_ref = [{
        'id': 0, 'parent': None,
        'children': [
            {'id': 1, 'parent': 0, 'children': []},
            {'id': 2, 'parent': 0, 'children': [
                {'id': 3, 'parent': 2, 'children': []},
                {'id': 4, 'parent': 2, 'children': []},
            ]},
        ],
    }]

    assert [el.to_dict() for el in tree] == tree_ref

    assert len(fe.filenames_) == sum([el.size for el in tree])
    assert len(fe.filenames_) == 5
    assert len(tree[0].flatten()) == 5
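A reduced sketch of the e-mail threading flow exercised above, assuming cache_dir and data_dir point at a small e-mail corpus as in the test; the comments only paraphrase what the excerpt itself does:

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.setup()
fe.ingest(data_dir=data_dir)
fe.parse_email_headers()                              # header parsing required before threading
cat = _EmailThreadingWrapper(cache_dir=cache_dir, parent_id=uuid)
tree = cat.thread()                                   # list of root thread containers
as_dicts = [el.to_dict() for el in tree]              # nested {'id', 'parent', 'children'} structure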
Example #20
def test_feature_extraction_weighting(use_idf, sublinear_tf, binary, use_hashing):
    cache_dir = check_cache()

    use_idf = (use_idf == 'IDF')
    sublinear_tf = (sublinear_tf == 'sublinear TF')
    binary = (binary == 'binary')
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt', 
            use_idf=use_idf, binary=binary, use_hashing=use_hashing, sublinear_tf=sublinear_tf)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()


    fe.delete()
Example #21
def test_feature_extraction_nfeatures(n_features, use_idf, use_hashing):
    cache_dir = check_cache()

    use_hashing = (use_hashing == 'hashed')
    use_idf = (use_idf == 'IDF')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt',
            n_features=n_features,
            use_idf=use_idf, use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2,  np.ndarray) or scipy.sparse.issparse(res2), "not an array {}".format(res2)

    assert np.isfinite(res2.data).all()

    assert res2.shape[1] == fe.n_features_

    fe.delete()
Example #22
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import OptionalDependencyMissing, WrongParameter
from .run_suite import check_cache


basename = os.path.dirname(__file__)


cache_dir = check_cache()

EPSILON = 1e-4


data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

fe = FeatureVectorizer(cache_dir=cache_dir)
vect_uuid = fe.setup()
fe.ingest(data_dir, file_pattern='.*\d.txt')


lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid)
lsi.fit_transform(n_components=6)

ground_truth = parse_ground_truth_file(
                        os.path.join(data_dir, "..", "ground_truth_file.txt"))

_test_cases = itertools.product(
                       [False, True],
                       ["LinearSVC", "LogisticRegression", 'xgboost',
                        "NearestNeighbor", "NearestCentroid"],
                       [None, 'fast'])
Example #23
from freediscovery.categorization import Categorizer
from freediscovery.io import parse_ground_truth_file
from freediscovery.utils import classification_score
from freediscovery.exceptions import OptionalDependencyMissing
from ..utils import _silent
from .run_suite import check_cache

basename = os.path.dirname(__file__)

cache_dir = check_cache()

data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

n_features = 20000

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(data_dir,
                     file_pattern='.*\d.txt',
                     n_features=n_features,
                     binary=True,
                     use_idf=False,
                     norm=None)
uuid, filenames = fe.transform()

ground_truth = parse_ground_truth_file(
    os.path.join(data_dir, "..", "ground_truth_file.txt"))


@pytest.mark.parametrize(
    'method, cv',
    itertools.product(
Example #24
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern='.*\d.txt')

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir,
                                  parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.fit(index,
                                     ground_truth.is_relevant.values,
                                     method=method)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)

            htree = cat._get_htree(cat.pipeline.data)

            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError
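The 'Categorization' branch of the test above can be read on its own. A compact sketch, assuming a vectorized dataset uuid and a ground_truth frame with file_path and is_relevant columns, as in the excerpt:

cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=uuid, cv_n_folds=2)
cat.fe.db_.filenames_ = cat.fe.filenames_                # as done in the excerpt above
index = cat.fe.db_._search_filenames(ground_truth.file_path.values)
coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                         method='LogisticRegression')    # train the categorization model
Y_pred, md = cat.predict()                               # prediction scores for the ingested documents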
Example #25
def test_feature_extraction(analyzer, stop_words, ngram_range, use_idf,
                            sublinear_tf, binary, use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir,
                         file_pattern='.*\d.txt',
                         n_features=n_features,
                         analyzer=analyzer,
                         stop_words=stop_words,
                         ngram_range=ngram_range,
                         use_idf=use_idf,
                         binary=binary,
                         use_hashing=use_hashing,
                         sublinear_tf=sublinear_tf)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(
        res2), "not an array {}".format(res2)

    fe.search(['0.7.47.117435.txt'])
    fe.search(['DOES_NOT_EXIST.txt'])
    fe.list_datasets()
    assert np.isfinite(res2.data).all()

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.delete()
Example #26
from freediscovery.lsi import _LSIWrapper
from freediscovery.categorization import _CategorizerWrapper
from freediscovery.io import parse_ground_truth_file
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import OptionalDependencyMissing
from .run_suite import check_cache

basename = os.path.dirname(__file__)

cache_dir = check_cache()

EPSILON = 1e-4

data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

fe = FeatureVectorizer(cache_dir=cache_dir)
vect_uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt')
vect_uuid, filenames = fe.transform()

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid)
lsi.fit_transform(n_components=6)

ground_truth = parse_ground_truth_file(
    os.path.join(data_dir, "..", "ground_truth_file.txt"))

_test_cases = itertools.product(
    [False, True],
    [
        "LinearSVC", "LogisticRegression", 'xgboost', "NearestNeighbor",
        "NearestCentroid"
    ],
Example #27
def test_sampling_filenames():
    cache_dir = check_cache()

    fe_pars = {'binary': True, 'norm': None, 'sublinear_tf': False}

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt',
              use_hashing=True, **fe_pars)  # TODO unused (overwritten on the next line)
    uuid, filenames = fe.transform()
    fnames, X = fe.load(uuid)

    # don't use any sampling
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=None)
    fnames_s, X_s = fes.load(uuid)
    pars = fe._load_pars()
    assert_array_equal(fnames, fnames_s)
    assert_array_equal(X.data, X_s.data)
    assert fes.n_samples_ == len(fnames)

    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=fnames[::-1])

    assert fes.sampling_index is not None
    fnames_s, X_s = fes.load(uuid)
    pars_s = fes._load_pars_sampled()
    assert_array_equal(fnames[::-1], fnames_s)
    assert_array_equal(X[::-1,:].data, X_s.data)
    for key in pars:
        if key == 'filenames':
            assert pars[key][::-1] == pars_s[key]
        else:
            assert pars[key] == pars_s[key]

    # repeat twice the filenames
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=(fnames+fnames))

    assert fes.sampling_index is not None
    fnames_s, X_s = fes.load(uuid)
    pars_s = fes._load_pars_sampled()
    assert_array_equal(fnames + fnames, fnames_s )
    assert_array_equal(X.data, X_s[:len(fnames)].data)
    assert_array_equal(X.data, X_s[len(fnames):].data)
    assert fes.n_samples_ == len(fnames)*2
    #for key in pars:
    #    assert pars[key] == pars_s[key]

    # downsample the filenames
    N = len(fnames)//2

    np.random.seed(1)

    idx = np.random.choice(fe.n_samples_, size=(N,))
    fnames_s_in = np.array(fnames)[idx].tolist()

    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
            sampling_filenames=fnames_s_in)

    assert fes.sampling_index is not None
    fnames_s, X_s = fes.load(uuid)
    pars_s = fes._load_pars_sampled()
    assert_array_equal(fnames_s_in, fnames_s )
    assert_array_equal(X[idx].data, X_s.data)
    assert fes.n_samples_ == N

    fe.delete()
Example #28
def test_df_filtering(use_hashing, min_df, max_df):
    cache_dir = check_cache()


    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, use_hashing=use_hashing, min_df=min_df, max_df=max_df)
    uuid, filenames = fe.transform()

    _, X = fe.load(uuid)

    fe2 = FeatureVectorizer(cache_dir=cache_dir)
    uuid2 = fe2.preprocess(data_dir, use_hashing=use_hashing)
    uuid2, filenames = fe2.transform()

    _, X2 = fe2.load(uuid2)

    if use_hashing:
        assert X.shape[1] == X2.shape[1] # min/max_df does not affect the number of features
    else:
        assert X.shape[1] < X2.shape[1] # min/max_df removes some features


    fe.delete()
Example #29
pd.options.display.float_format = '{:,.3f}'.format

data_dir = "../freediscovery_shared/tar_fd_benchmark"
examples_to_server_path = "../" # relative path between this file and the FreeDiscovery source folder

BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL


# # 1. Feature extraction (non hashed)

# In[2]:

n_features = 30000
cache_dir = '/tmp/'

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess("../"+data_dir+'/data',
                             n_features=n_features, use_hashing=False,
                             use_idf=True, stop_words='english')
uuid, filenames  = fe.transform()


# # 2. Document Clustering (LSI + K-Means)

# In[4]:

cat = Clustering(cache_dir=cache_dir, dsid=uuid)

n_clusters = 10
n_top_words = 6
lsi_components = 50
Example #30
    ds = load_dataset(dataset_name, load_ground_truth=True, cache_dir=cache_dir)


    # To use a custom dataset, simply specify the following variables
    data_dir = ds['data_dir']
    seed_filenames = ds['seed_filenames']
    seed_y = ds['seed_y']
    ground_truth_file = ds['ground_truth_file']  # (optional)

    fe_opts = {'data_dir': data_dir,
               'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
               'use_idf': 1, 'sublinear_tf': 0, 'binary': 0, 'n_features': 50001,
               'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2"
              }

    fe = FeatureVectorizer(cache_dir=cache_dir)

    uuid = fe.preprocess(**fe_opts)
    uuid, filenames  = fe.transform()

    seed_index = fe.search(seed_filenames)

    cat = Categorizer(cache_dir=cache_dir, dsid=uuid)
    cat.train(seed_index, seed_y)

    predictions = cat.predict()

    gt = parse_ground_truth_file( ground_truth_file)
    idx_ref = cat.fe.search(gt.index.values)
    idx_res = np.arange(cat.fe.n_samples_, dtype='int')