Example #1
    def wd_seg(self):
        def _file_seg_(filename):
            # read the raw text and segment it with jieba
            with codecs.open(filename, 'r', 'utf-8') as _f:
                txt = _f.read()
            seg_list = [_wd for _wd in jieba.cut(txt)]
            # word count before removing stopwords
            count_before = len(seg_list)
            # drop every occurrence of each stopword in a single pass
            stop_set = set(sw.strip() for sw in self.stop_wd_list)
            seg_list = [wd for wd in seg_list if wd not in stop_set]
            # word count after removing stopwords
            count_after = len(seg_list)
            return seg_list, count_before, count_after
        # iterate over the files collected by get_sets_of_root_path_tree,
        # mirroring each file's directory layout under a '-seg' tree
        for iter_filename in self.set_dict['files']:
            log_dir = '\\'.join(iter_filename.split('.')[0].split('\\')[:-3]) + '-seg\\' + \
                      '\\'.join(iter_filename.split('.')[0].split('\\')[-3:-1])

            if not os.path.exists(log_dir):
                os.makedirs(log_dir)

            log_name = iter_filename.split('.')[0].split('\\')[-1] + '.log'
            wd_list, _wds, wds_ = _file_seg_(iter_filename)
            Log.log_blue_running("Processed " + self.set_dict['category'] + ' - ' + iter_filename + ',' +
                            ' words account reduced from ' + str(_wds) + ' to ' + str(wds_))
            f = codecs.open(log_dir + '\\' + log_name, 'w', 'utf-8')
            for wd in wd_list:
                if wd != u' ' and wd != u'\n':
                    f.write(wd)
                    f.write(' ')
            f.close()
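For context, here is a minimal standalone sketch of the same segment-and-filter step. It assumes only jieba and a plain stopword list; segment_file and the stopwords parameter are illustrative names, not part of the class above.

import codecs

import jieba

def segment_file(filename, stopwords):
    # read the raw text and let jieba produce a token stream
    with codecs.open(filename, 'r', 'utf-8') as f:
        tokens = list(jieba.cut(f.read()))
    before = len(tokens)
    # a set makes the membership test O(1) per token
    stop_set = set(stopwords)
    kept = [t for t in tokens if t not in stop_set]
    return kept, before, len(kept)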
Example #2
    def text_classification_polynomial(self, test_category):
        logfile = open('tc_log.log', 'a')
        prior_pos = self._get_prior_possibility_()

        def p_possibility(p_pos_list):
            # sum log-probabilities instead of multiplying raw
            # probabilities, which would underflow to zero
            return sum(math.log(p) for p in p_pos_list)

        def classification(p_possibility_list, category):
            # each entry maps a log-probability to its category name;
            # pick the category with the highest log-probability
            best = max(p_possibility_list, key=lambda d: list(d)[0])
            p = list(best.values())[0]
            return p, p == category

        test_num = len(self._test_set_dict_[test_category])
        if_right = 0
        for _iter_file in self._test_set_dict_[test_category]:
            wd_list = self.get_wd_list(_iter_file)
            category_pos = []
            for _iter_category in range(len(self._wd_list_category_)):
                # total token count observed for this category
                wd_account = sum(
                    self._wd_list_category_[_iter_category]['words'].values())
                _p_category = []
                # Laplace denominator: category token count plus vocabulary size
                denominator = wd_account + len(self._wd_list_total_)
                for wd in wd_list.keys():
                    # n_ij: occurrences of wd in this category's training set
                    nij = self._wd_list_category_[_iter_category]['words'].get(
                        wd, 0)
                    # add the smoothed probability once per occurrence of the
                    # word (multinomial model with add-one smoothing)
                    for _iter_account_ in range(wd_list[wd]):
                        _p = (nij + 1) * 1.0 / denominator
                        _p_category.append(_p)
                category_pos.append({
                    p_possibility(_p_category) + math.log(prior_pos[_iter_category]['prior_possibility']):
                    self._wd_list_category_[_iter_category]['category']
                })
            if classification(category_pos, test_category)[1]:
                if_right += 1
        Log.log_blue_running('Category: ' + test_category + ' total ' +
                             str(test_num) + ', correct: ' + str(if_right) +
                             ' Accuracy: ' +
                             str((if_right * 1.0 / test_num) * 100) + '%')
        logfile.write('Category: ' + test_category + ' total ' +
                      str(test_num) + ', correct: ' + str(if_right) +
                      ' Accuracy: ' + str((if_right * 1.0 / test_num) * 100) +
                      '%' + '\n')
        logfile.close()
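The scoring loop above is multinomial Naive Bayes with add-one (Laplace) smoothing. A hedged restatement of the per-class score it computes, using hypothetical names (nb_log_score, doc_counts, class_counts), might look like this:

import math

def nb_log_score(doc_counts, class_counts, vocab_size, log_prior):
    # score(c) = log P(c) + sum over words w of
    #            tf(w) * log((n_cw + 1) / (n_c + |V|))
    n_c = sum(class_counts.values())
    denominator = n_c + vocab_size
    score = log_prior
    for wd, tf in doc_counts.items():
        n_cw = class_counts.get(wd, 0)
        score += tf * math.log((n_cw + 1.0) / denominator)
    return score

The predicted category is then the argmax of these scores over all classes, which is what classification extracts from category_pos.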
Example #3
    def _bayes_train_(self):
        self._wd_list_category_ = []
        if os.path.exists(self._cache_path_total_):
            Log.log_blue_running('Corpus cache loaded.')
            self._wd_list_total_ = self.load_cache(self._cache_path_total_)
            for _iter_cache_file in range(len(self._cache_path_category_)):
                self._wd_list_category_.append({
                    'category':
                    self._cache_path_category_[_iter_cache_file]['category'],
                    'words':
                    self.load_cache(
                        self._cache_path_category_[_iter_cache_file]
                        ['cache_path'])
                })
        else:
            Log.log_blue_running('Naive Bayes train module.')
            wd_list_total = {}
            for _iter_basename in self._train_set_dict_:

                characteristic_wd_category = {}

                for _iter_filename in self._train_set_dict_[_iter_basename]:
                    tmp = TextClassification.get_wd_list(_iter_filename)
                    for wd in tmp:
                        # accumulate corpus-wide word frequencies
                        wd_list_total[wd] = wd_list_total.get(wd, 0) + tmp[wd]
                    # sort this file's words by descending frequency
                    words_freq = sorted(tmp.items(), key=lambda t: -t[1])

                    # keep the 10 most frequent words as this file's feature
                    # set; slicing past the end of the list is safe
                    characteristic_wd_file = dict(words_freq[:10])

                    for wd in characteristic_wd_file:
                        # fold the file's feature words into the category tally
                        characteristic_wd_category[wd] = (
                            characteristic_wd_category.get(wd, 0) +
                            characteristic_wd_file[wd])

                self.create_cache(
                    characteristic_wd_category,
                    self._train_set_dir_ + '\\' + _iter_basename + '.json')
                Log.log_blue_running('Category: ' + _iter_basename +
                                     ' cache created.')

            self._wd_list_total_ = wd_list_total
            self.create_cache(
                self._wd_list_total_,
                os.path.abspath(self._train_set_dir_ + '\\..') +
                '\\wdlist-total.json')
            Log.log_blue_running('Corpus word-lists cache created.')
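In effect, training reduces to two frequency tables: a corpus-wide word count and, per category, the sum over files of each file's ten most frequent words. A compact restatement of the per-category step under that reading, with hypothetical names, could be:

from collections import Counter

def category_features(per_file_counts, k=10):
    # per_file_counts: one {word: freq} dict per training file
    features = Counter()
    for counts in per_file_counts:
        top_k = sorted(counts.items(), key=lambda t: -t[1])[:k]
        # Counter.update with a mapping adds counts rather than replacing
        features.update(dict(top_k))
    return dict(features)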
Example #4
    def get_stop_wd_list():
        with codecs.open('stopwd.txt', 'r', 'utf-8') as f:
            # one stopword per line; strip trailing newlines so the list
            # matches segmented tokens directly
            stop_wd_list = [line.strip() for line in f]
        Log.log_blue_running('Stopwords total : ' + str(len(stop_wd_list)) + '\n')
        return stop_wd_list
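How these pieces fit together is implied rather than shown; a hedged wiring sketch, assuming the TextClassification class referenced in Example #3 exposes the attributes used above:

tc = TextClassification()             # assumed constructor, see Example #3
tc.stop_wd_list = get_stop_wd_list()  # stopwords consumed by wd_seg in Example #1
tc.wd_seg()                           # writes one space-separated .log per input file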