Пример #1
0
from sklearn import metrics
import numpy as np
import multiprocessing as mp
from threading import Thread


# logging.basicConfig(level=logging.INFO)


# Load the e-mails found at the small dataset path exposed by unsec.
collection = EmailCollection(unsec.SMALL_DATASET_PATH)

# Language filtering is left switched off in this example:
# collection.keep_lang("fr")

# Configure the clusterizer: target "both", a hierarchical algorithm,
# and a TF-IDF vectorizer.
engine = Clusterizer(
    collection,
    target="both",
    algorithm=HierarchicalAlgo(),
    vectorizer=TfidfVectorizer(),
)
engine.compute()

# Dump every cluster, one e-mail after another.
for cluster in engine.clusters:
    print("==cluster==")
    for message in cluster:
        print("---email---")
        print("subject   :", message.get_subject())
        print("body      :", message.get_body())
        print("clean     :", message.clean)

Пример #2
0
    log = logging.getLogger(__name__)
    log.info(args.config+" has been load as configuration")

else:
    logging.disable(logging.NOTSET)




# Build the e-mail collection from the directory named in the configuration.
collection = EmailCollection()
collection.add_from_directory(cfg.PATH)

# Optional language filter, applied only when the configuration defines LANG.
# The configured value is forwarded to keep_lang(); previously the filter was
# hard-coded to "fr" no matter what LANG was set to.
if hasattr(cfg, "LANG"):
    collection.keep_lang(cfg.LANG)

engine = Clusterizer(collection)

# Each engine setting falls back to a default when the configuration omits it.
engine.vectorizer = getattr(cfg, "VECTORIZER", LogicVectorizer())
engine.algorithm = getattr(cfg, "ALGORITHM", HierarchicalAlgo())
# n_clusters is set on whichever algorithm was selected above, so a configured
# ALGORITHM also receives N_CLUSTERS (or the default of 3).
engine.algorithm.n_clusters = getattr(cfg, "N_CLUSTERS", 3)
engine.target = getattr(cfg, "TARGET", "both")


# Test mode: entered only when the configuration sets ENABLE_TEST to a truthy
# value (absent means disabled).
if getattr(cfg, "ENABLE_TEST", False):
    # Raise explicitly instead of using `assert`: assert statements are removed
    # when Python runs with -O, which would silently skip this configuration
    # check. The exception type and message are the same as before.
    if not hasattr(cfg, "TEST_CLUSTERING_RANGE"):
        raise AssertionError("TEST_CLUSTERING_RANGE has not been defined")

    # Folder used by the test mode; "results" when TEST_FOLDER is not configured.
    test_folder = getattr(cfg, "TEST_FOLDER", "results")
    # exist_ok=True creates the folder when missing and avoids the
    # check-then-create race of os.path.exists() followed by os.makedirs().
    os.makedirs(test_folder, exist_ok=True)