class TweetHolder:
    """Wrap a single tweet record together with its token counts and cluster.

    The wrapped record ``self.tw`` is accessed through ``get``, ``setdefault``,
    ``in`` and item assignment, so it is presumably a ``dict``.
    """

    def __init__(self, tw):
        """
        Store the tweet, read its id, and tokenize its text right away.

        :param tw: the tweet record (presumably a dict) holding at least the
            ``tk.key_id`` and ``tk.key_text`` fields
        """
        self.tw = tw
        self.id = tw.get(tk.key_id)
        # The cluster currently holding this tweet; None until assigned.
        self.cluster = None
        # Counts of every token kept by tokenize().
        self.tokens = IdFreqDict()
        # Subset of self.tokens accepted by the last validate() call.
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        """Return True if ``key`` is present in the wrapped tweet record."""
        return key in self.tw

    def __getitem__(self, key):
        """Return the tweet field ``key``, or None when it is missing."""
        return self.get(key)

    def __setitem__(self, key, value):
        """
        Set the tweet field ``key`` only if it is not already present.

        NOTE: this delegates to ``setdefault``, so ``holder[key] = value``
        does NOT overwrite an existing field.
        """
        self.setdefault(key, value)

    def get(self, key):
        """Return the tweet field ``key``, or None when it is missing."""
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        """Insert ``key`` with ``value`` into the tweet record if absent."""
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """
        Return the id of the cluster this tweet is assigned to.

        :return: the cluster's ``cluid``, or None if no cluster is assigned
        """
        cluster = self.cluster
        # An unassigned tweet is a normal state (see update_cluid_into_tw),
        # so report None instead of failing on ``None.cluid``.
        return None if cluster is None else cluster.cluid

    def update_cluid_into_tw(self):
        """
        Write the current cluster id into the tweet's ``tk.key_event_cluid``
        field; the field is set to None when no cluster is assigned.
        """
        self.tw[tk.key_event_cluid] = self.get_cluid()

    def tokenize(self):
        """
        Split the lower-cased tweet text into tokens and count them in
        ``self.tokens``.

        Only tokens that ``pu.is_valid_keyword`` accepts and that
        ``pu.is_stop_word`` rejects are kept; each kept token is stripped
        of surrounding whitespace before being counted.
        """
        text = self.tw[tk.key_text].lower()
        candidates = pu.findall(pu.tokenize_pattern, text)
        kept = [
            candidate.strip()
            for candidate in candidates
            if pu.is_valid_keyword(candidate) and not pu.is_stop_word(candidate)
        ]
        for token in kept:
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        """
        Rebuild ``self.valid_tokens`` from the tokens that ``using_ifd`` knows.

        :param using_ifd: object exposing ``has_word(word)`` (presumably an
            IdFreqDict) that decides which tokens are currently valid
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """
        Move this tweet from its current cluster (if any) to ``cluster``.

        :param cluster: the target cluster, or None to only detach the tweet
        """
        previous = self.cluster
        if previous is not None:
            previous.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
class ClusterHolder:
    """Hold the tweets assigned to one cluster plus aggregate token counts."""

    def __init__(self, cluid):
        """
        Create an empty cluster.

        :param cluid: identifier of this cluster
        """
        self.cluid = cluid
        # Maps tweet id -> tweet holder for every tweet currently in the cluster.
        self.twhdict = dict()
        # Aggregated token counts of the member tweets' valid tokens.
        self.tokens = IdFreqDict()
        # Number of tweets added minus number removed.
        self.twnum = 0

    # ---- basic functions ----

    def get_twharr(self):
        """
        :return: list of the tweet holders currently in the cluster
        """
        return list(self.twhdict.values())

    def get_twarr(self):
        """
        :return: list of the raw tweet records (``twh.tw``) in the cluster
        """
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        """
        :return: list with the ``tk.key_event_label`` field of every tweet
            in the cluster
        """
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        """Drop all member tweets and reset the token counts and tweet count."""
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        """
        Add a tweet holder to, or remove it from, this cluster and update the
        aggregate token counts with its ``valid_tokens``.

        :param twh: the tweet holder to add or remove
        :param factor: a positive value adds the tweet; any other value
            (e.g. -1 or 0) removes it
        """
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
            return
        # Token counts are dropped unconditionally; membership and the tweet
        # count only change when the tweet is actually registered here.
        self.tokens.drop_freq_from(twh_tokens, newest=False)
        if twh_id in self.twhdict:
            self.twhdict.pop(twh_id)
            self.twnum -= 1

    # ---- extra functions ----

    def get_rep_label(self, rep_thres):
        """
        Return the most frequent label if it covers a large enough share of
        the cluster.

        :param rep_thres: minimum fraction of ``self.twnum`` the most common
            label must reach
        :return: the most common label, or -1 when it falls below the
            threshold or the cluster has no tweets
        """
        top = Counter(self.get_lbarr()).most_common(1)
        if not top:
            # Empty cluster: most_common() yields nothing, so there is no
            # representative label (previously this raised IndexError).
            return -1
        max_label, max_lbnum = top[0]
        if max_lbnum < self.twnum * rep_thres:
            return -1
        return max_label
class TweetHolder:
    """Wrap a single tweet record together with its token counts and cluster.

    The wrapped record ``self.tw`` is a dict-like tweet object; it is accessed
    through ``get``, ``setdefault``, ``in`` and item assignment.
    """

    def __init__(self, tw):
        """
        Store the tweet, read its id, and tokenize its text right away.

        :param tw: the tweet record (dict) holding at least the
            ``tk.key_id`` and ``tk.key_text`` fields
        """
        self.tw = tw
        self.id = tw.get(tk.key_id)
        # The cluster currently holding this tweet; None until assigned.
        self.cluster = None
        # Counts of every token produced by tokenize().
        self.tokens = IdFreqDict()
        # Subset of self.tokens accepted by the last validate() call.
        self.valid_tokens = IdFreqDict()
        self.tokenize()

    def __contains__(self, key):
        """Return True if ``key`` is present in the wrapped tweet record."""
        return key in self.tw

    def __getitem__(self, key):
        """Return the tweet field ``key``, or None when it is missing."""
        return self.get(key)

    def __setitem__(self, key, value):
        """
        Set the tweet field ``key`` only if it is not already present.

        NOTE: this delegates to ``setdefault``, so ``holder[key] = value``
        does NOT overwrite an existing field.
        """
        self.setdefault(key, value)

    def get(self, key):
        """Return the tweet field ``key``, or None when it is missing."""
        return self.tw.get(key, None)

    def setdefault(self, key, value):
        """Insert ``key`` with ``value`` into the tweet record if absent."""
        self.tw.setdefault(key, value)

    def get_cluid(self):
        """
        Return the id of the cluster this tweet is currently assigned to.

        :return: the cluster's id number, or None if no cluster is assigned
        """
        cluster = self.cluster
        # An unassigned tweet is a normal state (see update_cluid_into_tw),
        # so report None instead of failing on ``None.cluid``.
        return None if cluster is None else cluster.cluid

    def update_cluid_into_tw(self):
        """
        Set the ``tk.key_event_cluid`` field of the tweet record (``self.tw``)
        to the id of the currently assigned cluster, or to None if the tweet
        has not been assigned to a cluster yet.

        :return:
        """
        self.tw[tk.key_event_cluid] = self.get_cluid()

    def tokenize(self):
        """
        Tokenize the lower-cased tweet text with ``pu.valid_tokenize`` and
        count the resulting tokens in ``self.tokens``.

        :return:
        """
        text = self.tw[tk.key_text].lower()
        for token in pu.valid_tokenize(text):
            self.tokens.count_word(token)

    def validate(self, using_ifd):
        """
        Refresh the valid-token table: every word of ``self.tokens`` that
        also exists in ``using_ifd`` is re-counted into ``self.valid_tokens``.

        :param using_ifd: utils.id_freq_dict.IdFreqDict holding the tokens
            that are valid in the current iteration
        :return:
        """
        self.valid_tokens.clear()
        for word, freq in self.tokens.word_freq_enumerate(newest=False):
            if using_ifd.has_word(word):
                self.valid_tokens.count_word(word, freq)

    def update_cluster(self, cluster):
        """
        Detach this tweet from its current cluster (if it has one), then
        attach it to ``cluster`` (if not None) and record it in
        ``self.cluster``.

        :param cluster: the target cluster object, or None to only detach
        :return:
        """
        previous = self.cluster
        if previous is not None:
            previous.update_by_twh(self, factor=-1)
        self.cluster = cluster
        if cluster is not None:
            cluster.update_by_twh(self, factor=1)
class ClusterHolder:
    """Hold the tweets assigned to one cluster plus aggregate token counts."""

    def __init__(self, cluid):
        """
        Create an empty cluster.

        :param cluid: identifier of this cluster
        """
        self.cluid = cluid
        # Maps tweet id -> tweet holder for every tweet currently in the cluster.
        self.twhdict = dict()
        # Aggregated token counts of the member tweets' valid tokens.
        self.tokens = IdFreqDict()
        # Number of tweets added minus number removed.
        self.twnum = 0

    # ---- basic functions ----

    def get_twharr(self):
        """
        Return the tweet wrapper objects currently held by the cluster, in no
        particular order.

        :return: list whose elements are TweetHolder objects
        """
        return list(self.twhdict.values())

    def get_twarr(self):
        """
        Return the raw tweet objects currently held by the cluster, in no
        particular order.

        :return: list of tweets
        """
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        """
        Return the labels carried by the cluster's tweets (when that
        information exists), in no particular order.

        :return: list of the tweets' ``tk.key_event_label`` values, i.e. the
            original label of each tweet (which cluster it originally
            belonged to)
        """
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        """
        Reset the cluster's statistics: token table, tweet list and tweet
        count.

        :return:
        """
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        """
        Add the given tweet wrapper to, or remove it from, this cluster and
        update the cluster's token table and other statistics from its
        ``valid_tokens``.

        :param twh: TweetHolder, the tweet wrapper to add or remove
        :param factor: int; a positive value (e.g. 1) adds the tweet, any
            other value (e.g. -1 or 0) removes it
        :return:
        """
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
            return
        # Token counts are dropped unconditionally; membership and the tweet
        # count only change when the tweet is actually registered here.
        self.tokens.drop_freq_from(twh_tokens, newest=False)
        if twh_id in self.twhdict:
            self.twhdict.pop(twh_id)
            self.twnum -= 1

    # ---- extra functions ----

    def get_rep_label(self, rep_thres):
        """
        Determine whether some label accounts for a share of the cluster's
        tweets that reaches the threshold ``rep_thres``.

        :param rep_thres: float, the decision threshold
        :return: the most common label if its count is at least
            ``self.twnum * rep_thres``; otherwise -1 (also -1 when the
            cluster has no tweets)
        """
        top = Counter(self.get_lbarr()).most_common(1)
        if not top:
            # Empty cluster: most_common() yields nothing, so there is no
            # representative label (previously this raised IndexError).
            return -1
        max_label, max_lbnum = top[0]
        if max_lbnum < self.twnum * rep_thres:
            return -1
        return max_label