Example #1
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.retwsetdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    def update_by_retwset(self, retwset, factor):
        set_mstid = retwset.master_twhid
        if factor > 0:
            # if set_mstid in self.retwsetdict:
            #     raise ValueError('cannot move in retwset since retwid {} is in cluster'.format(set_mstid))
            self.retwsetdict[set_mstid] = retwset
            for twh in retwset.get_twharr():
                self.update_by_twh(twh, 1)
        else:
            # if set_mstid not in self.retwsetdict:
            #     raise ValueError('cannot move out retwset since retwid {} not in cluster'.format(set_mstid))
            for twh in retwset.get_twharr():
                self.update_by_twh(twh, -1)
            self.retwsetdict.pop(set_mstid)

    def update_by_twh(self, twh, factor):
        twh_tokens = twh.tokens
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens)
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens)
            self.twnum -= 1
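
A minimal usage sketch for the ClusterHolder above, assuming ClusterHolder and IdFreqDict are in scope. _StubTwh and _StubRetwSet are hypothetical stand-ins (not the project's real TweetHolder / retweet-set classes), invented here only to exercise update_by_retwset.

# Hypothetical stand-ins for the project's tweet holder / retweet-set objects.
class _StubTwh:
    def __init__(self):
        self.tokens = IdFreqDict()            # per-tweet token frequencies

class _StubRetwSet:
    def __init__(self, master_twhid, twharr):
        self.master_twhid = master_twhid      # id of the set's master tweet
        self._twharr = twharr

    def get_twharr(self):
        return self._twharr

cluster = ClusterHolder(cluid=0)
retwset = _StubRetwSet('master_001', [_StubTwh(), _StubTwh()])
cluster.update_by_retwset(retwset, 1)         # move the whole retweet set into the cluster
assert cluster.twnum == 2
cluster.update_by_retwset(retwset, -1)        # move it back out again
assert cluster.twnum == 0 and len(cluster.retwsetdict) == 0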
Example #2
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     """
     实际执行聚类以及采样,若old_twharr为空,则认为new_twharr是corpus,
     并在其上进行完整的聚类过程,耗时较多;
     若old_twharr不为空,则认为old_twharr已经持有了之前聚类的结果信息,
     并对new_twharr中的每条推特包装对象采样已有的聚类对象
     :param old_twharr: list,元素类型为 TweetHolder
     :param new_twharr: list,元素类型为 TweetHolder
     :param iter_num: 聚类循环所用的迭代次数
     :return:
     """
     cludict = self.cludict
     """ recalculate the valid dictionary """
     valid_dict = IdFreqDict()
     D = len(old_twharr) + len(new_twharr)
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for cluster in cludict.values():
         cluster.clear()
     for old_twh in old_twharr:
         # if old_twh.get_cluid() not in cludict:
         #     continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if old_twharr:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         cluster = cludict[new_cluid]
         new_twh.update_cluster(cluster)
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {} th clustering, clu num: {}'.format(
             i + 1, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[cluid] = ClusterHolder(cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
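
A hedged calling sketch for the two modes described in the docstring above. clusterer, history_twharr and fresh_twharr are hypothetical names for an already-constructed instance of the enclosing class and its lists of TweetHolder objects; the iter_num values are arbitrary.

# Initial pass: no history, so new_twharr is treated as the whole corpus and
# a complete (comparatively slow) clustering run is performed over it.
clusterer.GSDPMM_twarr(old_twharr=[], new_twharr=fresh_twharr, iter_num=30)

# Incremental pass: old_twharr already carries the previous clustering result,
# so each new TweetHolder is first sampled into the existing clusters only,
# then refined over iter_num Gibbs-sampling iterations.
clusterer.GSDPMM_twarr(old_twharr=history_twharr, new_twharr=fresh_twharr, iter_num=5)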
Example #3
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
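
get_tokens_multi above expects each worker (get_tokens) to return an (IdFreqDict, document_count) pair that it can merge. The sketch below only illustrates that contract: get_tokens_sketch is a hypothetical stand-in for the real worker, and iter_doc_ifds is an assumed helper yielding one IdFreqDict per document in a file.

def get_tokens_sketch(file_list):
    # Accumulate one IdFreqDict and one document counter over this
    # process's share of the files, then hand the pair back to the parent.
    ifd, doc_num = IdFreqDict(), 0
    for file in file_list:
        for doc_ifd in iter_doc_ifds(file):   # assumed helper, not part of the project
            doc_num += 1
            ifd.merge_freq_from(doc_ifd)
    return ifd, doc_num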
Example #4
    class ClusterHolder:
        def __init__(self, cluid):
            self.cluid = cluid
            self.tokens = IdFreqDict()
            self.twnum = 0

        def update_by_twh(self, twh, factor):
            twh_tokens = twh.ifd
            if factor > 0:
                self.tokens.merge_freq_from(twh_tokens, newest=False)
                self.twnum += 1
            else:
                self.tokens.drop_freq_from(twh_tokens, newest=False)
                self.twnum -= 1
Example #5
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     cludict = self.cludict
     valid_dict = IdFreqDict()
     if len(old_twharr) > 0:
         for cluster in cludict.values():
             cluster.clear()
     D = len(old_twharr) + len(new_twharr)
     """ recalculate the valid dictionary """
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for old_twh in old_twharr:
         if old_twh.get_cluid() not in cludict:
             continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if len(old_twharr) > 0:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         new_twh.update_cluster(cludict[new_cluid])
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {} th clustering, clu num: {}'.format(i, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
Example #6
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.twhdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    """ basic functions """

    def get_twharr(self):
        return list(self.twhdict.values())

    def get_twarr(self):
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens, newest=False)
            if twh_id in self.twhdict:
                self.twhdict.pop(twh_id)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        lb_count = Counter(self.get_lbarr())
        max_label, max_lbnum = lb_count.most_common(1)[0]
        rep_label = -1 if max_lbnum < self.twnum * rep_thres else max_label
        return rep_label
Example #7
class ClusterHolder:
    def __init__(self, cluid):
        self.cluid = cluid
        self.twhdict = dict()
        self.tokens = IdFreqDict()
        self.twnum = 0

    """ basic functions """

    def get_twharr(self):
        """
        返回聚类当前持有的推特包装对象的列表,不考虑排列顺序
        :return:  list,每个元素类型为TweetHolder
        """
        return list(self.twhdict.values())

    def get_twarr(self):
        """
        返回聚类当前持有的推特对象的列表,不考虑排列顺序
        :return: list,推特列表
        """
        return [twh.tw for twh in self.twhdict.values()]

    def get_lbarr(self):
        """
        返回聚类当前的推特对象所持有的标记(若存在该信息)的列表,不考虑排列顺序
        :return: list,元素为int,表示推特原本的标记值(原本属于哪个聚类)
        """
        return [twh[tk.key_event_label] for twh in self.twhdict.values()]

    def clear(self):
        """
        清空当前聚类的统计信息,包括分词表、推特列表、推特计数
        :return:
        """
        self.twhdict.clear()
        self.tokens.clear()
        self.twnum = 0

    def update_by_twh(self, twh, factor):
        """
        将输入的推特包装对象加入/移出当前聚类,并根据其 valid_tokens 更新当前聚类的分词表等统计信息
        :param twh: TweetHolder,要加入的推特包装对象
        :param factor: int,1表示加入,0表示移出
        :return:
        """
        twh_tokens = twh.valid_tokens
        twh_id = twh.id
        if factor > 0:
            self.tokens.merge_freq_from(twh_tokens, newest=False)
            self.twhdict[twh_id] = twh
            self.twnum += 1
        else:
            self.tokens.drop_freq_from(twh_tokens, newest=False)
            if twh_id in self.twhdict:
                self.twhdict.pop(twh_id)
            self.twnum -= 1

    """ extra functions """

    def get_rep_label(self, rep_thres):
        """
        计算当前聚类中是否存在标记数占推特列表总数的比例大于阈值 rep_thres 的标记
        :param rep_thres: float,判定阈值
        :return: int,若存在足够占比的标记则返回该标记,否则返回-1
        """
        lb_count = Counter(self.get_lbarr())
        max_label, max_lbnum = lb_count.most_common(1)[0]
        rep_label = -1 if max_lbnum < self.twnum * rep_thres else max_label
        return rep_label
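
A small worked example of the decision rule in get_rep_label; the label list and the two rep_thres values are made up for illustration, and the snippet simply replays the same Counter-based computation outside the class.

from collections import Counter

# Suppose a cluster holds 10 tweets whose event labels are:
labels = [3, 3, 3, 3, 3, 3, 3, 1, 1, 2]
twnum = len(labels)
max_label, max_lbnum = Counter(labels).most_common(1)[0]   # -> (3, 7)

# Same rule as get_rep_label: keep the majority label only if its share reaches rep_thres.
print(-1 if max_lbnum < twnum * 0.6 else max_label)   # 7 >= 10 * 0.6 -> prints 3
print(-1 if max_lbnum < twnum * 0.8 else max_label)   # 7 <  10 * 0.8 -> prints -1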