def __init__(self, email_collection: EmailCollection, target = "body", vectorizer = None, algorithm = None): """ Create a clusterizer object @arg string email_collection this should be an EmailCollection """ # Input Source email Collection to clusterize self.email_collection = email_collection # List of text to clusterize self.raws = [] # Output Clusterized collections of email self.clusters = [] # Current clustering algorithm self.algorithm = algorithm # Current vectorizer self.vectorizer = vectorizer # Current Cleaner self.cleaner = Cleaner() # Clusters ID self.labels = [] # Email subject or Email body as the target of analysis self.target = target self.log = logging.getLogger(__name__) self.target_factor = 1
class Clusterizer(object): def __init__(self, email_collection: EmailCollection, target = "body", vectorizer = None, algorithm = None): """ Create a clusterizer object @arg string email_collection this should be an EmailCollection """ # Input Source email Collection to clusterize self.email_collection = email_collection # List of text to clusterize self.raws = [] # Output Clusterized collections of email self.clusters = [] # Current clustering algorithm self.algorithm = algorithm # Current vectorizer self.vectorizer = vectorizer # Current Cleaner self.cleaner = Cleaner() # Clusters ID self.labels = [] # Email subject or Email body as the target of analysis self.target = target self.log = logging.getLogger(__name__) self.target_factor = 1 def set_algorithm(self, algorithm: Algo) : """ set Algo to use """ self.algorithm = algorithm def set_vectorizer(self, vectorizer: Vectorizer) : """ set Vectorizer to use """ self.vectorizer = vectorizer def run_vectorizer(self): """ start vectorizer """ self.log.info(str(self.vectorizer.__class__.__name__)+" running ...") # fill te vectorizer input self.vectorizer.raws = self.raws self.vectorizer.vectorize() # meta programmation : associate vector to mails for index in range(len(self.vectorizer.matrix)): self.email_collection[index].vector = self.vectorizer.matrix[index] def run_cleaner(self): """ clean collection and fill vectorizer """ self.raws.clear() self.log.info("Cleaner running (target: "+self.target+")") for email in self.email_collection: source = str() if self.target == "both": subject_clean = self.cleaner.clean(email.get_subject()) body_clean = self.cleaner.clean(email.get_body()) len_subject = len(subject_clean.split(" ")) len_body = len(body_clean.split(" ")) # compute coef coef = int(len_body / len_subject * self.target_factor) add = " ".join(subject_clean.split(" ") * coef) clean = body_clean + " " + add else: if self.target == "subject": source = email.get_subject() if self.target == "body": source = email.get_body() clean = self.cleaner.clean(source) self.raws.append(clean) email.clean = clean # Meta programmation ! Create new variable def run_algorithm(self): self.log.info(str(self.algorithm.__class__.__name__)+" running ...") self.labels = self.algorithm.run(self.vectorizer.matrix) # self.groups = self.order_groups(self.groups) self.compute_clusters() def compute(self): self.log.info("Start clustering on collection {} emails".format(self.email_collection.count())) self.run_cleaner() self.run_vectorizer() self.run_algorithm() self.show_log() def show_log(self): s_scores = self.silhouette_samples() self.log.info("Results : ") for index in range(len(self.clusters)): self.log.info("\t{} email(s)\t[{:5}] [{:5}] ".format( self.clusters[index].count(), round(self.clusters[index].get_similarity(),3), round(s_scores[index],2))) self.log.info("total silhouette score: {:5}".format(round(self.silhouette_score(),3))) self.log.info("total linkage score: {:5}".format(round(self.cluster_linkage(),3))) def cluster_linkage(self): centroids = [] for coll in self.clusters: centroids.append(coll.get_centroid()) ref = centroids[0] return tools.avg_distance(centroids, ref) def silhouette_score(self): array = np.array(self.vectorizer.matrix) return metrics.silhouette_score(array, self.labels, metric='cosine') def silhouette_samples(self): array = np.array(self.vectorizer.matrix) return metrics.silhouette_samples(array, self.labels, metric='cosine') def compute_clusters(self): '''To recover the docs by cluster''' self.log.info("Compute Email clustering ...") n_cluster = (max(self.labels)) self.clusters = [EmailCollection("cluster_"+str(i)) for i in range(n_cluster+1)] for index in range(len(self.labels)): gid = self.labels[index] self.clusters[gid].add_email(self.email_collection[index]) def to_json(self): data = {} meta = {} meta["date"] = datetime.datetime.now().isoformat() meta["collection_size"] = self.email_collection.count() meta["target"] = self.target meta["cluster_count"] = len(self.clusters) meta["vectorizer"] = self.vectorizer.__class__.__name__ meta["algorithm"] = self.algorithm.__class__.__name__ meta["linkage"] = self.cluster_linkage() clusters = [] for index in range(len(self.clusters)): cluster = {} cluster["count"] = self.clusters[index].count() cluster["files"] = [] cluster["similarity"] = self.clusters[index].get_similarity() for email in self.clusters[index]: e = {} e["filename"] = email.filename e["subject"] = email.get_subject() cluster["files"].append(e) clusters.append(cluster) data["meta"] = meta data["clusters"] = clusters return json.dumps(data) def save_json(self, filename): with open(filename,"w") as file: file.write(self.to_json()) def scores(self): intra = sum([col.get_similarity() for col in self.clusters]) / len(self.clusters) extra = self.cluster_linkage() silho = self.silhouette_score() return intra, extra, silho def print_table(self): for index in range(len(self.clusters)): for email in self.clusters[index]: print(email.get_name(), self.clusters[index].name, sep="\t")