Example #1
 def _predict(self, txt, train_w2c, train_t2c, debug=False, emoticon=True):
     # Score txt against each sentiment tag (P/N/O) by summing per-gram
     # likelihood scores and return the highest-scoring tag.
     #if emoticon and 'emoticon' not in self._ngrams_config:
     #    self._ngrams_config.append('emoticon')
     #elif not emoticon and 'emoticon' in self._ngrams_config:
     #    self._ngrams_config = filter(lambda x: x != 'emoticon', self._ngrams_config)
     #grams = self._retrieve_feature(txt)
     grams = ST.retrieve_feature(txt,
                                 feature_extract_config=self._ngrams_config,
                                 gram_icon_mixed=emoticon)
     if debug:
         linfo('begin debug case: %s' % txt)
     tag2score = {"P": 0, "N": 0, "O": 0}
     for w in grams:
         for tag in tag2score:
             if not train_t2c[tag]:
                 continue
             score = self._cal_likelihood(train_w2c[tag].get(w, 0),
                                          train_t2c[tag])
             tag2score[tag] += score
             if debug:
                 linfo(
                     'DEBUG probability for gram %s given tag %s is: %.4f. gram cnt: %s, tag cnt: %s'
                     % (w, tag, score, train_w2c[tag].get(
                         w, 0), train_t2c[tag]))
     pred_tag = max(tag2score, key=tag2score.get)
     if debug:
         linfo('predict tag2score: %s' % tag2score)
     return pred_tag
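
The scoring above relies on self._cal_likelihood, which the example does not show. A minimal sketch of what such a helper might compute, assuming a Laplace-smoothed log-likelihood (the smoothing constant and the exact formula are assumptions, not the original implementation):

import math

def cal_likelihood(gram_cnt, tag_cnt, smooth=1.0):
    # Hypothetical stand-in for self._cal_likelihood: Laplace-smoothed
    # log-likelihood of a gram given a tag.
    # gram_cnt: occurrences of the gram in texts carrying this tag
    # tag_cnt:  number of training texts carrying this tag
    return math.log((gram_cnt + smooth) / (tag_cnt + smooth))

print('%.4f' % cal_likelihood(3, 100))   # a gram seen 3 times among 100 tagged texts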
Example #2
 def get_feature(self, txt, cache=False):
     # Return the feature bag for txt, reusing the per-text cache when present.
     if txt in self.txt2bags:
         bags = self.txt2bags[txt]
     else:
         bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
         if cache:
             self.txt2bags[txt] = bags
     return bags
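
The caching pattern can be exercised in isolation; a minimal sketch with a plain dict and a stand-in extractor (the whitespace tokenizer only replaces ST.retrieve_feature for illustration):

txt2bags = {}

def get_feature(txt, extract, cache=False):
    # Same lookup-then-extract flow as above, as a standalone function.
    if txt in txt2bags:
        return txt2bags[txt]
    bags = extract(txt)
    if cache:
        txt2bags[txt] = bags
    return bags

first = get_feature('good day', lambda t: t.split(), cache=True)
again = get_feature('good day', lambda t: t.split())
assert first is again   # the second call is served from the cache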
Example #3
 def _feature_encoding(self, txt):
     # Encode txt as a sparse {gram id: 1} dict over the known vocabulary.
     bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
     #fs = {x:0 for x in gram2gid}
     fs = {}
     for gram in bags:
         if gram in self.gram2gid:
             fs[self.gram2gid[gram]] = 1
     return fs
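
The encoding yields a sparse presence dict keyed by gram id; a self-contained illustration with a toy gram2gid mapping (the mapping and its ids are made up):

gram2gid = {'good': 1, 'day': 2, 'bad': 3}   # toy vocabulary, ids are arbitrary

def feature_encoding(grams, gram2gid):
    # Presence-only features: {gram id: 1} for every gram the vocabulary knows;
    # unknown grams are silently dropped.
    fs = {}
    for gram in grams:
        if gram in gram2gid:
            fs[gram2gid[gram]] = 1
    return fs

print(feature_encoding(['good', 'day', 'unseen'], gram2gid))   # {1: 1, 2: 1}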
Example #4
 def _discretize_gram2gid(self):
     # Assign each distinct gram a 1-based integer id, in first-seen order.
     w2id = {}
     for txt in self._train_xs:
         bags = ST.retrieve_feature(txt, feature_extract_config=self._feature_extract_config)
         for w in bags:
             if w not in w2id:
                 w2id[w] = len(w2id) + 1
     linfo('grams cnt: %s' % len(w2id))
     return w2id
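
The id assignment starts at 1, not 0, and follows first-seen order. A standalone sketch of the same loop over a toy corpus (the corpus and tokenizer are illustrative only):

def build_gram2gid(texts, extract):
    # Assign each distinct gram a 1-based id in the order it is first seen.
    w2id = {}
    for txt in texts:
        for w in extract(txt):
            if w not in w2id:
                w2id[w] = len(w2id) + 1
    return w2id

print(build_gram2gid(['good day', 'bad day'], lambda t: t.split()))
# {'good': 1, 'day': 2, 'bad': 3}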
Example #5
 def discret_txt(self, txt):
     # Build a dense binary bag-of-words vector over the full vocabulary.
     fs = [0] * len(self.gram2gid)
     bags = ST.retrieve_feature(
         txt, feature_extract_config=self._feature_extract_config)
     for w in bags:
         if w in self.gram2gid:
             wid = self.gram2gid[w]
             fs[wid] = 1
     return fs
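
The vector has len(self.gram2gid) slots indexed from 0, so this assumes 0-based gram ids; with a 1-based mapping like the one in Example #4, the highest id would fall outside the list. A self-contained illustration with 0-based ids:

gram2gid = {'good': 0, 'day': 1, 'bad': 2}   # toy 0-based mapping

def discret_txt(grams, gram2gid):
    # Dense binary bag-of-words vector over the whole vocabulary.
    fs = [0] * len(gram2gid)
    for w in grams:
        if w in gram2gid:
            fs[gram2gid[w]] = 1
    return fs

print(discret_txt(['good', 'day'], gram2gid))   # [1, 1, 0]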
Example #6
 def _cal_shard2info(self, shard_indexs):
     # Count tag frequencies and, per tag, how often each gram appears
     # across this shard of the training data.
     #word2cnt = BayesClassifier.Word2Cnt()
     word2presence = BayesClassifier.Word2Cnt()
     #word_total_cnt = 0
     tag2cnt = {"P": 0, "N": 0, "O": 0}
     for index in shard_indexs:
         #word_total_cnt += len(x)
         txt = self._train_xs[index] 
         tag = self._train_ys[index]
         tag2cnt[tag] += 1
         bags = ST.retrieve_feature(txt, feature_extract_config=self._ngrams_config)
         for w in bags:
             word2presence[tag].setdefault(w, 0)
             word2presence[tag][w] += 1
             #word2cnt[tag].setdefault(w, 0)
             #word2cnt[tag][w] += 1
         
     return tag2cnt, word2presence
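
The per-shard counts presumably get merged into global statistics afterwards; a hedged sketch of such a reduce step (the merge function is an assumption, it does not appear in the original code):

def merge_shards(shard_results):
    # Combine (tag2cnt, word2presence) pairs from several shards into global counts.
    total_tag2cnt = {'P': 0, 'N': 0, 'O': 0}
    total_w2c = {'P': {}, 'N': {}, 'O': {}}
    for tag2cnt, word2presence in shard_results:
        for tag, cnt in tag2cnt.items():
            total_tag2cnt[tag] += cnt
        for tag, w2c in word2presence.items():
            for w, c in w2c.items():
                total_w2c[tag][w] = total_w2c[tag].get(w, 0) + c
    return total_tag2cnt, total_w2c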
Example #7
    def build_sparse_X(self, _xs):
        # Build a binary CSR document-term matrix: one row per text, one column per gram id.
        row_num = len(_xs)
        col_num = len(self.gram2gid)

        rows, cols = [], []
        total_cnt = 0
        for i, txt in enumerate(_xs):
            bags = ST.retrieve_feature(
                txt, feature_extract_config=self._feature_extract_config)
            for w in bags:
                if w in self.gram2gid:
                    wid = self.gram2gid[w]
                    rows.append(i)
                    cols.append(wid)
                    total_cnt += 1
        linfo('build scipy sparse matrix. total_valid_cnt: %s' % total_cnt)
        row = np.array(rows)
        col = np.array(cols)
        data = np.ones(total_cnt, dtype=int)
        mtx = sparse.csr_matrix((data, (row, col)), shape=(row_num, col_num))
        return mtx
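
The same construction, shrunk to a toy vocabulary, shows the shape of the resulting matrix (the vocabulary and texts are made up; what consumes the matrix downstream is not stated in the example):

import numpy as np
from scipy import sparse

gram2gid = {'good': 0, 'day': 1, 'bad': 2}    # toy vocabulary
texts = [['good', 'day'], ['bad', 'day']]     # pre-extracted grams per text

rows, cols = [], []
for i, grams in enumerate(texts):
    for w in grams:
        if w in gram2gid:
            rows.append(i)
            cols.append(gram2gid[w])
data = np.ones(len(rows), dtype=int)
mtx = sparse.csr_matrix((data, (rows, cols)), shape=(len(texts), len(gram2gid)))
print(mtx.toarray())   # [[1 1 0]
                       #  [0 1 1]]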