Example #1
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     """
     实际执行聚类以及采样,若old_twharr为空,则认为new_twharr是corpus,
     并在其上进行完整的聚类过程,耗时较多;
     若old_twharr不为空,则认为old_twharr已经持有了之前聚类的结果信息,
     并对new_twharr中的每条推特包装对象采样已有的聚类对象
     :param old_twharr: list,元素类型为 TweetHolder
     :param new_twharr: list,元素类型为 TweetHolder
     :param iter_num: 聚类循环所用的迭代次数
     :return:
     """
     cludict = self.cludict
     """ recalculate the valid dictionary """
     valid_dict = IdFreqDict()
     D = len(old_twharr) + len(new_twharr)
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for cluster in cludict.values():
         cluster.clear()
     for old_twh in old_twharr:
         # if old_twh.get_cluid() not in cludict:
         #     continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if old_twharr:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         cluster = cludict[new_cluid]
         new_twh.update_cluster(cluster)
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {} th clustering, clu num: {}'.format(
             i + 1, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[cluid] = ClusterHolder(cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
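
The docstring above describes two modes: a full clustering pass when old_twharr is empty, and incremental sampling of new tweets into the retained clusters otherwise. A minimal driver sketch of how the method might be called in a streaming setting (clusterer, tweet_batches and iter_num=30 are illustrative assumptions, not names from the source):

    history = []
    for batch in tweet_batches:              # each batch: list of TweetHolder objects
        # first call: history is empty, so a full clustering pass runs over the batch;
        # later calls: each new tweet is sampled into the clusters kept from history
        clusterer.GSDPMM_twarr(history, batch, iter_num=30)
        history.extend(batch)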
Example #2
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
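
get_tokens_multi fans the files out to 20 worker processes and then merges the per-worker IdFreqDict results. The same split-and-merge pattern, sketched with the standard library only (Counter and multiprocessing.Pool stand in for the repo's IdFreqDict and mu helpers; count_tokens_in_files is a hypothetical stand-in for get_tokens):

    from collections import Counter
    from multiprocessing import Pool

    def count_tokens_in_files(paths):
        # per-worker result: a token counter plus the number of documents seen
        counter, doc_num = Counter(), 0
        for path in paths:
            with open(path, encoding='utf-8') as f:
                for line in f:
                    doc_num += 1
                    counter.update(line.lower().split())
        return counter, doc_num

    def count_tokens_multi(path_blocks, process_num=20):
        # merge the partial results, as get_tokens_multi does above
        total, total_doc_num = Counter(), 0
        with Pool(process_num) as pool:
            for counter, doc_num in pool.map(count_tokens_in_files, path_blocks):
                total.update(counter)
                total_doc_num += doc_num
        return total, total_doc_num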
Example #3
 def GSDPMM_twarr(self, old_twharr, new_twharr, iter_num):
     cludict = self.cludict
     valid_dict = IdFreqDict()
     if len(old_twharr) > 0:
         for cluster in cludict.values():
             cluster.clear()
     D = len(old_twharr) + len(new_twharr)
     """ recalculate the valid dictionary """
     for twh in old_twharr + new_twharr:
         valid_dict.merge_freq_from(twh.tokens, newest=False)
     valid_dict.drop_words_by_condition(3)
     """ reallocate & parameter """
     for old_twh in old_twharr:
         if old_twh.get_cluid() not in cludict:
             continue
         old_twh.validate(valid_dict)
         old_cluster = old_twh.cluster
         old_twh.cluster = None
         old_twh.update_cluster(old_cluster)
     for new_twh in new_twharr:
         new_twh.validate(valid_dict)
         if len(old_twharr) > 0:
             new_cluid = self.sample(new_twh,
                                     D,
                                     using_max=True,
                                     no_new_clu=True)
         else:
             new_cluid = self.max_cluid
         new_twh.update_cluster(cludict[new_cluid])
     self.beta0 = self.beta * valid_dict.vocabulary_size()
     """ start iteration """
     for i in range(iter_num):
         print('  {} th clustering, clu num: {}'.format(i, len(cludict)))
         for twh in new_twharr:
             cluster = twh.cluster
             twh.update_cluster(None)
             if cluster.twnum == 0:
                 cludict.pop(cluster.cluid)
             cluid = self.sample(twh, D, using_max=(i == iter_num - 1))
             if cluid not in cludict:
                 self.max_cluid = cluid
                 cludict[self.max_cluid] = ClusterHolder(self.max_cluid)
             twh.update_cluster(cludict[cluid])
     for twh in new_twharr:
         twh.update_cluid_into_tw()
Example #4
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            # crude tokenization: runs of at least three letters, '_', '#' or '-' characters
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    # very long tokens are split further by pu.segment
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                # keep tokens that are not stop words, contain letters, and have length >= 3
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and 3 <= len(token):
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
class WordFreqCounter:
    def __init__(self, capignore=True, worddict=None):
        self.doc_num = 0
        self.capignore = capignore

        self.worddict = worddict if worddict else IdFreqDict()
        self.posdict = IdFreqDict()
        # pos_dict_file = os.path.abspath(os.path.dirname(__file__)) + os.path.sep + 'posdict.txt'
        # self.posdict.load_worddict(pos_dict_file)
        # self.notional = {'NN': 0, 'NNP': 0, 'NNPS': 0, 'NNS': 0, 'RB': 0, 'RBR': 0, 'RBS': 0,
        #                  'UH': 0, 'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }
        # self.verb = {'VB': 0, 'VBD': 0, 'VBG': 0, 'VBN': 0, 'VBP': 0, 'VBZ': 0, }

    def vocabulary_size(self):
        # return self.worddict.vocabulary_size() + self.posdict.vocabulary_size()
        return self.worddict.vocabulary_size()

    @staticmethod
    def is_valid_wordlabel(wordlabel):
        isnotentity = wordlabel[1].startswith('O')
        return isnotentity

    def calculate_idf(self):
        if self.doc_num == 0:
            raise ValueError('No valid word has been recorded yet.')
        for word in self.worddict.dictionary:
            df = self.worddict.dictionary[word]['df']
            self.worddict.dictionary[word]['idf'] = 10 / np.log(
                (self.doc_num + 1) / df)

    def feature_matrix_of_twarr(self, twarr):
        mtx = list()
        for tw in twarr:
            idfvec, added, num_entity = self.wordlabel_vector(
                tw[tk.key_wordlabels])
            mtx.append(idfvec * (np.log(len(added) + 1) + 1) *
                       (np.log(num_entity + 1) + 1))
        return np.array(mtx)

    def wordlabel_vector(self, wordlabels):
        added_word_dict = dict()
        word_vector = np.array([0] * self.worddict.vocabulary_size(),
                               dtype=np.float32)
        pos_vector = np.array([0] * self.posdict.vocabulary_size(),
                              dtype=np.float32)
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip(
                "#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            # if not wordlabel[0].lower().strip("#") == word:
            #     print(wordlabel[2], wordlabel[0].lower().strip("#"), '->', word)
            if not (pu.is_valid_keyword(word)
                    and self.is_valid_wordlabel(wordlabel)):
                continue
            if word in added_word_dict:
                continue
            added_word_dict[word] = True
            if not self.worddict.is_word_in_dict(word):
                pos_tag = wordlabel[2]
                pos_vector[self.posdict.word2id(pos_tag)] += 1
            else:
                wordid = self.worddict.word_2_id(word)
                word_vector[wordid] = self.worddict.dictionary[word]['idf']
        added_word = sorted(added_word_dict.keys())
        added_entity = sorted(
            [1 for w in wordlabels if not self.is_valid_wordlabel(w)])
        return word_vector, added_word, len(added_entity)
        # return np.concatenate([word_vector, pos_vector]), added_word, len(added_entity)

    def expand_dict_and_count_df_from_wordlabel(self, wordlabels):
        added_word_dict = dict()
        for wordlabel in wordlabels:
            word = wordlabel[0].lower().strip(
                "#") if self.capignore else wordlabel[0]
            # word = get_root_word(word) if wordlabel[2] in self.verb else word
            if not (pu.is_valid_keyword(word)
                    and self.is_valid_wordlabel(wordlabel)):
                continue
            else:
                if word in added_word_dict:
                    continue
                added_word_dict[word] = True
                # "word" is now neither entity nor invalid keyword_info or duplicated word by now
                self.worddict.expand_dict_from_word(word)
                if 'df' not in self.worddict.dictionary[word]:
                    self.worddict.dictionary[word]['df'] = 1
                else:
                    self.worddict.dictionary[word]['df'] += 1
        self.doc_num += 1

    def expand_from_wordlabel_array(self, wordlabel_arr):
        for wordlabel in wordlabel_arr:
            self.expand_dict_and_count_df_from_wordlabel(wordlabel)
        self.worddict.reset_ids()

    def reserve_word_by_idf_condition(self, rsv_cond):
        self.calculate_idf()
        for word in list(self.worddict.dictionary.keys()):
            word_idf = self.worddict.dictionary[word]['idf']
            if not rsv_cond(word_idf):
                self.worddict.remove_word(word)
        self.worddict.reset_ids()

    def merge_from(self, othercounter):
        thisdict = self.worddict.dictionary
        otherdict = othercounter.worddict.dictionary
        for otherword, otherwordattr in otherdict.items():
            if otherword not in thisdict:
                thisdict[otherword] = otherwordattr
                thisdict[otherword]['idf'] /= 5

    # def most_common_words(self, rank):
    #     wordnum = self.worddict.vocabulary_size()
    #     if 0 < rank < 1:
    #         top_k = wordnum * rank
    #     elif rank > 1 and type(rank) is int:
    #         top_k = rank
    #     else:
    #         raise ValueError('rank is not a valid number' + str(rank))
    #     dic = self.worddict.dictionary
    #     return sorted(dic.keys(), key=lambda w: dic[w]['idf'])[:top_k]

    def dump_worddict(self, dict_file, overwrite=True):
        self.worddict.dump_worddict(dict_file, overwrite)

    def load_worddict(self, dict_file):
        self.worddict.load_worddict(dict_file)
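
A call-order sketch for WordFreqCounter, inferred from the methods above. The wordlabel layout [surface, entity label, POS tag] is inferred from how wordlabel[0], wordlabel[1] and wordlabel[2] are indexed; the sample data, and the assumption that pu.is_valid_keyword accepts these tokens, are hypothetical:

    wc = WordFreqCounter()
    docs = [
        [['flood', 'O', 'NN'], ['warning', 'O', 'NN']],
        [['flood', 'O', 'NN'], ['Jakarta', 'B-LOC', 'NNP']],
    ]
    # 1. grow the dictionary and the per-word document-frequency counts
    wc.expand_from_wordlabel_array(docs)
    # 2. compute idf values (reserve_word_by_idf_condition can prune the dictionary here)
    wc.calculate_idf()
    # 3. build the per-tweet feature matrix from each tweet's wordlabels
    mtx = wc.feature_matrix_of_twarr([{tk.key_wordlabels: wl} for wl in docs])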
Example #6
            self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]

if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False

    pre2post = dict(zip(pre_list, post_list))
    for pre, post in pre2post.items():
        ifd = IdFreqDict()
        ifd.load_dict(pre)
        pre_vocab = ifd.vocabulary_size()
        print('{} loaded, {} words'.format(pre, pre_vocab))
        ifd.drop_words_by_condition(word_remove)
        print('{} words dropped, remain {} words'.format(
            pre_vocab - ifd.vocabulary_size(), ifd.vocabulary_size()))
        ifd.dump_dict(post)
        print('dump over')
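
Note that drop_words_by_condition is called with an int threshold in the earlier examples but with a predicate (word_remove) here. A minimal sketch of a frequency dictionary supporting both calling styles, under the assumption that the int form means "drop words rarer than the threshold" (MiniFreqDict is hypothetical, not the repo's IdFreqDict):

    class MiniFreqDict:
        def __init__(self):
            self.freq = {}

        def count_word(self, word):
            self.freq[word] = self.freq.get(word, 0) + 1

        def vocabulary_size(self):
            return len(self.freq)

        def drop_words_by_condition(self, condition):
            # int -> drop words whose frequency is below the threshold;
            # callable -> drop words for which condition(word, freq) is True
            if isinstance(condition, int):
                threshold = condition
                condition = lambda w, f: f < threshold
            for word in [w for w, f in self.freq.items() if condition(w, f)]:
                del self.freq[word]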