def test_threading():
    """Thread the ingested e-mails and compare the result with a reference tree.

    The test ingests the module-level ``data_dir``, parses the e-mail
    headers, runs ``_EmailThreadingWrapper.thread()`` and checks that:

    * the serialised threads equal the expected nested structure,
    * the tree sizes add up to the number of ingested files,
    * exactly five files were ingested, and
    * flattening the first thread yields five elements.
    """
    cache_path = check_cache()

    vectorizer = FeatureVectorizer(cache_dir=cache_path, mode='w')
    dataset_id = vectorizer.setup()
    vectorizer.ingest(data_dir=data_dir)
    vectorizer.parse_email_headers()

    threader = _EmailThreadingWrapper(cache_dir=cache_path,
                                      parent_id=dataset_id)
    threads = threader.thread()
    # Called only to make sure it does not raise; the result is not inspected.
    threader.get_params()

    # Expected layout: a single root (id 0) with children 1 and 2, where
    # node 2 itself owns the two leaves 3 and 4.
    leaf_1 = {'id': 1, 'children': [], 'parent': 0}
    leaf_3 = {'id': 3, 'children': [], 'parent': 2}
    leaf_4 = {'id': 4, 'children': [], 'parent': 2}
    branch_2 = {'id': 2, 'parent': 0, 'children': [leaf_3, leaf_4]}
    expected_threads = [
        {'id': 0, 'parent': None, 'children': [leaf_1, branch_2]},
    ]

    serialised = [thread.to_dict() for thread in threads]
    assert serialised == expected_threads

    n_files = len(vectorizer.filenames_)
    assert n_files == sum(thread.tree_size for thread in threads)
    assert len(vectorizer.filenames_) == 5
    assert len(threads[0].flatten()) == 5
def test_email_parsing():
    """Parse e-mail headers and expect one metadata entry per ingested file.

    Ingests the ``fedora-devel-list-2008-October`` data directory (resolved
    relative to the module-level ``basename``), parses the headers, checks
    that the parsed result has the same length as ``filenames_`` and then
    deletes the dataset.
    """
    mailing_list_dir = os.path.join(
        basename, "..", "..", "data", "fedora-devel-list-2008-October"
    )

    vectorizer = FeatureVectorizer(cache_dir=check_cache(), mode='w')
    # The identifier returned by setup() is not needed in this test.
    vectorizer.setup()
    vectorizer.ingest(mailing_list_dir)

    parsed_headers = vectorizer.parse_email_headers()
    assert len(vectorizer.filenames_) == len(parsed_headers)

    vectorizer.delete()