Example #1
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = list(filter(lambda seg: seg in ranks, segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x,y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag:
                data.extend(rows)
                kw_data.append(kw_row)
        if 0 == (idx+1)%2000:
            print("[Training Data] %d/%d poems are processed." %(idx+1, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row)+'\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row)+'\n')
    print("Training data is generated.")
 def _get_adjlists(self):
     print("[TextRank] Generating word graph ...")
     segmenter = Segmenter()
     poems = Poems()
     adjlists = dict()
     # Count number of co-occurrence.
     for poem in poems:
         for sentence in poem:
             words = []
             for word in segmenter.segment(sentence):
                 if word not in self.stopwords:
                     words.append(word)
             for word in words:
                 if word not in adjlists:
                     adjlists[word] = dict()
             for i in range(len(words)):
                 for j in range(i + 1, len(words)):
                     if words[j] not in adjlists[words[i]]:
                         adjlists[words[i]][words[j]] = 1.0
                     else:
                         adjlists[words[i]][words[j]] += 1.0
                     if words[i] not in adjlists[words[j]]:
                         adjlists[words[j]][words[i]] = 1.0
                     else:
                         adjlists[words[j]][words[i]] += 1.0
     # Normalize weights.
     for a in adjlists:
         sum_w = sum(w for _, w in adjlists[a].items())
         for b in adjlists[a]:
             adjlists[a][b] /= sum_w
     return adjlists
Example #3
def _rank_all_words():
    segmenter = Segmenter()  # generates the sxhy dict
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            segs = list(filter(lambda word: word not in stopwords,
                               segmenter.segment(sentence)))  # segmentation result, minus stopwords
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            for i, seg in enumerate(segs):
                for _, other in enumerate(segs[i + 1:]):
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other]+1 \
                                if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg]+1 \
                                if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())  # total weight of this word's neighbours
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
Example #4
def _rank_all_words():
    segmenter = Segmenter()  # sentence segmenter
    stopwords = get_stopwords()  # stopword list
    print "Start TextRank over the selected quatrains ..."
    quatrains = get_quatrains()  # the collection of quatrains
    adjlist = dict()
    for idx, poem in enumerate(quatrains):  # for each poem
        if 0 == (idx + 1) % 10000:
            print "[TextRank] Scanning %d/%d poems ..." % (idx + 1,
                                                           len(quatrains))
        for sentence in poem['sentences']:  # for each line of the poem
            segs = filter(lambda word: word not in stopwords,
                          segmenter.segment(sentence))  # segments that are not stopwords
            for seg in segs:  # for each segment
                if seg not in adjlist:
                    adjlist[seg] = dict()  # give every segment its own dict
            for i, seg in enumerate(segs):  # for each segment
                for _, other in enumerate(
                        segs[i + 1:]):  # compare with every later segment; this builds the graph text_rank needs
                    if seg != other:  # a neat bit of code
                        adjlist[seg][other] = adjlist[seg][other]+1 \
                                if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg]+1 \
                                if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(
            weight
            for other, weight in adjlist[word].items())  # total weight of all words linked to this word
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum  # normalize each neighbour's weight
    print "[TextRank] Weighted graph has been built."
    _text_rank(adjlist)
Example #5
def _gen_train_data():
    sampled_poems = np.array(random_int_list(1, 70000, 4000))
    segmenter = Segmenter()  # generates the sxhy dict
    poems = get_pop_quatrains()  # get the 100,000 most popular quatrains
    random.shuffle(poems)  # shuffle the order
    ranks = get_word_ranks()  # TextRank: word -> rank number
    print("Generating training data ...")
    data = []
    kw_data = []
    test_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            test_flag = True
            rows = []
            kw_row = []
            test_row = []
            if idx in sampled_poems:
                test_flag = False
            for sentence in sentences:
                rows.append([sentence])
                test_row.append([sentence])
                segs = list(
                    filter(lambda seg: seg in ranks,
                           segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y,
                                 segs)  # pick the best-ranked keyword
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag and test_flag:
                data.extend(rows)
                kw_data.append(kw_row)
            if flag and not test_flag:
                test_data.extend(test_row)

        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." %
                  (idx + 1, len(poems)))
    print(test_data)
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    with codecs.open(test_path, 'w', 'utf-8') as fout:
        for test_row in test_data:
            fout.write('\t'.join(test_row) + '\n')
    print("Training data is generated.")
Example #6
def gen_train_data():
    """获取每一句的keywords,拼起来写入文件"""
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    ranked_words = RankedWords()

    gen_data = list()
    plan_data = list()

    valid = True
    counter_line = 0
    print('len(poems)==>', len(poems))
    for poem in poems:
        # print(len(poem))
        if len(poem) != 4:
            # print(poem)
            valid = False
            continue
        context = start_of_sentence()
        keywords = list()
        for sentence in poem:
            counter_line += 1
            keyword = ''
            if len(sentence) != 7:
                valid = False
                break
            filterwords = list(
                filter(lambda x: x in ranked_words,
                       segmenter.segment(sentence)))
            if filterwords:
                keyword = filterwords[0]
            for word in filterwords:
                # print('word==>',word)
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            if keyword:
                gen_line = sentence + end_of_sentence() + \
                           '\t' + keyword + '\t' + context + '\n'
                keywords.append(keyword)
                gen_data.append(gen_line)
                context += sentence + end_of_sentence()
        plan_data.append(' '.join(keywords))
    with open(plan_data_path, 'w') as fw:
        for data_iter in plan_data:
            fw.write(data_iter + '\n')
    with open(gen_data_path, 'w') as fw:
        for data_iter in gen_data:
            fw.write(data_iter)

    print('counter_line==>', counter_line)
    del segmenter, poems, ranked_words
Example #7
def _gen_word_cnts():
    counters = dict()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            segs = segmenter.segment(sentence)
            for seg in segs:
                counters[seg] = counters[seg]+1 if seg in counters else 1
        if 0 == (idx+1)%10000:
            print "[Word Count] %d/%d quatrains has been processed." %(idx+1, len(quatrains))
    with codecs.open(_wc_path, 'w', 'utf-8') as fout:
        json.dump(counters, fout)
Example #8
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                # only consider seven-character lines
                valid = False
                break
            #get a list of selected words from this sentence
            #ignore all words if they are not in the ranked words list
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]

            # from all words in this sentence, get the word with highest text_rank score
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word

            gen_line = sentence + end_of_sentence() + \
                       '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # plan data: each line is four keywords from the 4 sentences
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
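For a quick sense of the line format written above, here is a toy sketch that is not from the project: the sample quatrain is only illustrative data, the keywords are pretend picks, and the '^' / '$' stubs stand in for the real start_of_sentence() / end_of_sentence() tokens.

def start_of_sentence():  # illustration-only stub
    return '^'

def end_of_sentence():  # illustration-only stub
    return '$'

poem = ['两个黄鹂鸣翠柳', '一行白鹭上青天', '窗含西岭千秋雪', '门泊东吴万里船']
keywords = ['翠柳', '白鹭', '秋雪', '东吴']  # pretend these are the best-ranked words

context = start_of_sentence()
for sentence, keyword in zip(poem, keywords):
    # Same layout as gen_line above: sentence plus end marker, keyword, context so far.
    print(sentence + end_of_sentence() + '\t' + keyword + '\t' + context)
    context += sentence + end_of_sentence()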
Example #9
 def _train(self):
     print("Start training Word2Vec for planner ...")
     quatrains = get_quatrains()
     segmenter = Segmenter()
     seg_lists = []
     for idx, quatrain in enumerate(quatrains):
         seg_list = []
         for sentence in quatrain['sentences']:
             seg_list.extend([seg for seg in segmenter.segment(sentence) if seg in self.ranks])
         seg_lists.append(seg_list)
         if 0 == (idx+1)%10000:
             print("[Plan Word2Vec] %d/%d quatrains has been processed." %(idx+1, len(quatrains)))
     print("Hold on. This may take some time ...")
     self.model = models.Word2Vec(seg_lists, size = 512, min_count = 5)
     self.model.save(_model_path)
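The method above only trains and saves the planner model. Below is a minimal sketch of loading it back and querying it, assuming the legacy gensim API implied by the size= keyword (gensim < 4.0), that _model_path points at the file saved above, and a purely illustrative seed word.

from gensim import models

planner_model = models.Word2Vec.load(_model_path)
# Nearest neighbours of a seed keyword in the planner's embedding space.
for word, sim in planner_model.most_similar('秋风', topn=5):
    print('%s\t%.3f' % (word, sim))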
Example #10
 def setImage(self):
     filePath, _ = QtWidgets.QFileDialog.getOpenFileName(
         None, "Select Image", "", "Image Files (*.png *.jpg *.jpeg *.bmp)")
     if filePath:
         pixmap = QtGui.QPixmap(filePath)
         pixmap = pixmap.scaled(self.imageLabel.width(),
                                self.imageLabel.height(),
                                QtCore.Qt.KeepAspectRatio)
         self.imageLabel.setPixmap(pixmap)
         self.imageLabel.setAlignment(QtCore.Qt.AlignCenter)
         self.thresholdInc.setEnabled(True)
         self.thresholdDec.setEnabled(True)
         self.thresholdVal.setEnabled(True)
         self.autoSegmentBtn.setEnabled(True)
         self.selectImageBtn.setEnabled(False)
     self.filePath = filePath
     self.segmenter = Segmenter(filePath)
Example #11
def get_pop_quatrains(num = 100000):
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    min_word_cnts = [_min_word_cnt(cnts, quatrain, segmenter) \
            for i, quatrain in enumerate(quatrains)]
    indexes = sorted(range(len(quatrains)), key = lambda i: -min_word_cnts[i])
    return [quatrains[index] for index in indexes[:min(num, len(indexes))]]
Example #12
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # only process four-line, seven-character poems
        if len(poem) != 4:
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                valid = False
                break
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
Example #13
 def _get_adjlists(self):
     poems = Poems()
     segmenter = Segmenter()
     adjlists = collections.defaultdict(dict)
     for poem_set in poems:
         for poem in poem_set:
             words = segmenter.segment(poem)
             for i in range(len(words) - 1):
                 for j in range(i + 1, len(words)):
                     if words[j] not in adjlists[words[i]]:
                         adjlists[words[i]][words[j]] = 1.0
                     else:
                         adjlists[words[i]][words[j]] += 1.0
                     if words[i] not in adjlists[words[j]]:
                         adjlists[words[j]][words[i]] = 1.0
                     else:
                         adjlists[words[j]][words[i]] += 1.0
     return adjlists
Example #14
 def _train(self):
     print "Start training Word2Vec for planner ..."
     quatrains = get_quatrains()
     segmenter = Segmenter()  # segmenting a line is not the same as taking every single character
     seg_lists = []
     for idx, quatrain in enumerate(quatrains):
         seg_list = []
         for sentence in quatrain['sentences']:
             seg_list.extend(
                 filter(lambda seg: seg in self.ranks,
                        segmenter.segment(sentence)))
         seg_lists.append(seg_list)
         if 0 == (idx + 1) % 10000:
             print "[Plan Word2Vec] %d/%d quatrains has been processed." % (
                 idx + 1, len(quatrains))
     print "Hold on. This may take some time ..."
     self.model = models.Word2Vec(seg_lists, size=512,
                                  min_count=5)  # train the word-vector model
     self.model.save(_model_path)
Example #15
    def _build_adjlists_from_tencent_embeddings(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict(
        )  # 2D dict, dict[word1][word2]=prob(going from word1 to word2)
        wv = get_tencent_embedding_keyedVectors(_tencent_embedding_path)

        # Count number of co-occurrence.

        ######################## get a 2D cos sim matrix for all words ###################
        words = set()
        for poem in poems:
            for sentence in poem:
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        #keep only non-stopwords words
                        words.add(word)
        for word in words:
            if word not in adjlists:
                #initialize all words to a new dict()
                adjlists[word] = dict()

        for word in words:
            for other in words:

                if word == other:
                    continue

                if other in adjlists[word] or word in adjlists[other]:
                    continue

                sim = wv.similarity(word, other)
                adjlists[word][other] = sim
                adjlists[other][word] = sim

        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists
Example #16
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print "Generating training data ..."
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            lines = u''
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = filter(lambda seg: seg in ranks,
                              segmenter.segment(sentence))
                if 0 == len(segs):  # if a line yields no word found in ranks, drop the whole poem
                    flag = False
                    break
                keyword = reduce(lambda x, y: x
                                 if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)  # each element of rows is a line plus its keyword
            if flag:
                data.extend(rows)  # extend: each element of data is one row
                kw_data.append(kw_row)  # append the whole keyword row
        if 0 == (idx + 1) % 2000:
            print "[Training Data] %d/%d poems are processed." % (idx + 1,
                                                                  len(poems))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')  # each output line: a tab-separated poem line plus its keyword
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print "Training data is generated."
Example #17
    def _do_text_rank(self):
        print("Do text ranking ...")
        adjlists = self._get_adjlists()
        #adjlists = self._build_adjlists_from_tencent_embeddings()
        print("[TextRank] Total words: %d" % len(adjlists))

        # Value initialization.
        scores = dict()
        for word in adjlists:
            #score[0] is previous score, score[1] is new score
            scores[word] = [1.0, 1.0]

        # Synchronous value iterations.
        itr = 0
        #### train text rank here #####
        while True:
            sys.stdout.write("[TextRank] Iteration %d ..." % itr)
            sys.stdout.flush()
            for word, adjlist in adjlists.items():
                scores[word][1] = (1.0 - _damp) + _damp * \
                        sum(adjlists[other][word] * scores[other][0]
                                for other in adjlist)

            #eps is the difference between new score and previous score, used to check for convergence
            eps = 0
            for word in scores:
                eps = max(eps, abs(scores[word][0] - scores[word][1]))
                scores[word][0] = scores[word][1]
            print(" eps = %f" % eps)
            # if eps <= 1e-6:
            #     break
            #if itr == 200:  # train for only 200 iteration ###########################
            if itr == NUM_Of_ITERATIONS:
                break
            itr += 1

        # Dictionary-based comparison with TextRank score as a tie-breaker.
        segmenter = Segmenter()

        def cmp_key(x):
            word, score = x
            return (0 if word in segmenter.sxhy_dict else 1, -score)

        words = sorted([(word, score[0]) for word, score in scores.items()],
                       key=cmp_key)

        # Store ranked words and scores.
        with open(wordrank_path, 'w') as fout:
            json.dump(words, fout)
    def _do_text_rank(self):
        print("Do text ranking ...")
        adjlists = self._get_adjlists()
        print("[TextRank] Total words: %d" % len(adjlists))

        # Value initialization.
        scores = dict()
        for word in adjlists:
            scores[word] = [1.0, 1.0]

        # Synchronous value iterations.
        itr = 0
        while True:
            sys.stdout.write("[TextRank] Iteration %d ..." % itr)
            sys.stdout.flush()
            for word, adjlist in adjlists.items():
                scores[word][1] = (1.0 - _damp) + _damp * \
                        sum(adjlists[other][word] * scores[other][0]
                                for other in adjlist)
            eps = 0
            for word in scores:
                eps = max(eps, abs(scores[word][0] - scores[word][1]))
                scores[word][0] = scores[word][1]
            print(" eps = %f" % eps)
            if eps <= 1e-6:
                break
            itr += 1

        # Dictionary-based comparison with TextRank score as a tie-breaker.
        segmenter = Segmenter()

        def cmp_key(x):
            word, score = x
            return (0 if word in segmenter.sxhy_dict else 1, -score)

        words = sorted([(word, score[0]) for word, score in scores.items()],
                       key=cmp_key)

        # Store ranked words and scores.
        with open(wordrank_path, 'w') as fout:
            json.dump(words, fout)
    def __init__(self, punct_file=None,
                 stop_file=None,
                 once_file=None,
                 reserve_file=None,
                 area_file=None,
                 color_file=None,
                 quantifier_file=None,
                 num_file=None):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        if not punct_file:
            punct_file = cur_dir + '/dict/punct.txt'
        if not stop_file:
            stop_file = cur_dir + '/dict/stop_words.txt'
        if not once_file:
            once_file = cur_dir + '/dict/once.words'
        if not reserve_file:
            reserve_file = cur_dir + '/dict/reserve_words.txt'
     
        self.segmenter = Segmenter()
        self.punct = set()
        self.load_punct(punct_file)
 
        self.stop_words = set()
        self.load_stop_words(stop_file)
 
        self.remove_words = set()
        self.load_remove_words(once_file)
 
        self.reserve_words = set()
        self.load_reserve_words(reserve_file)

        self.replace_lst = [(u'斜跨包', u'斜挎包'), (u'!', u','), (u'。', u','), (u',', u','),
                (u'市场价', u''), (u'全国包邮', u''), (u'包邮', u''), (u'【', u''),
                (u'】', u''), (u'[', u''), (u']', u''), (u'《', u''), (u'》', u'')]

        self.word_label = WordLabel(area_file=area_file,
                                    color_file=None, quantifier_file=None, num_file=None)
Example #20
    def _do_text_rank(self):
        """scores,给所有词设置 双score    每个句子进行词语之间的组合, 迭代词语分数   给分数排序"""
        print("Do text ranking ...")
        segment = Segmenter()
        scores = dict()
        adjlists = self._get_adjlists()
        for word in adjlists:
            scores[word] = [1.0, 1.0]

        for word, adjust in adjlists.items():
            sums = sum([w for _, w in adjust.items()])
            for other, weight in adjust.items():
                adjust[other] = weight / sums
        _damp = 0.85
        while True:
            for word, adjust in adjlists.items():
                scores[word][1] = (1 - _damp) + _damp * sum([
                    scores[word][0] * adjlists[other][word] for other in adjust
                ])
            eps = 0.0
            for word in scores:
                eps = max(eps, abs(scores[word][0] - scores[word][1]))
                scores[word][0] = scores[word][1]
            print('eps=>', eps)
            if eps < 0.05:
                break

        def tmp_key(x):
            word, score = x
            return 0 if word in segment.sxhy_dict else -1, -score

        word_and_scores = sorted([(word, score[0])
                                  for word, score in scores.items()],
                                 key=tmp_key)
        with open(wordrank_path, 'w') as fw:
            json.dump(word_and_scores, fw)
        return scores
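As a self-contained toy sketch (not part of the project), the damped update used by the _do_text_rank variants above can be run on a tiny hand-built graph whose outgoing weights are already normalized:

_damp = 0.85
adjlists = {
    '山': {'水': 0.6, '月': 0.4},
    '水': {'山': 1.0},
    '月': {'山': 1.0},
}  # each word's outgoing weights sum to 1
scores = {word: [1.0, 1.0] for word in adjlists}
while True:
    for word in adjlists:
        # Damped sum of incoming weights times the previous scores.
        scores[word][1] = (1.0 - _damp) + _damp * sum(
            adjlists[other][word] * scores[other][0]
            for other in adjlists[word])
    eps = max(abs(s[0] - s[1]) for s in scores.values())
    for s in scores.values():
        s[0] = s[1]
    if eps <= 1e-6:
        break
print(sorted(scores.items(), key=lambda x: -x[1][0]))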
Example #21
 def generate_segmentation(self):  
     self.segmentation = Segmenter(self.line_info)
class WordFeature(object):
    def __init__(self, punct_file=None,
                 stop_file=None,
                 once_file=None,
                 reserve_file=None,
                 area_file=None,
                 color_file=None,
                 quantifier_file=None,
                 num_file=None):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        if not punct_file:
            punct_file = cur_dir + '/dict/punct.txt'
        if not stop_file:
            stop_file = cur_dir + '/dict/stop_words.txt'
        if not once_file:
            once_file = cur_dir + '/dict/once.words'
        if not reserve_file:
            reserve_file = cur_dir + '/dict/reserve_words.txt'
     
        self.segmenter = Segmenter()
        self.punct = set()
        self.load_punct(punct_file)
 
        self.stop_words = set()
        self.load_stop_words(stop_file)
 
        self.remove_words = set()
        self.load_remove_words(once_file)
 
        self.reserve_words = set()
        self.load_reserve_words(reserve_file)

        self.replace_lst = [(u'斜跨包', u'斜挎包'), (u'!', u','), (u'。', u','), (u',', u','),
                (u'市场价', u''), (u'全国包邮', u''), (u'包邮', u''), (u'【', u''),
                (u'】', u''), (u'[', u''), (u']', u''), (u'《', u''), (u'》', u'')]

        self.word_label = WordLabel(area_file=area_file,
                                    color_file=None, quantifier_file=None, num_file=None)

    def _add_char_to_set(self, myset, filename):
        with open(filename, 'r') as f:
            for l in f.readlines():
                chars = l.rstrip('\n').decode('utf-8')
                for c in chars:
                    myset.add(c)
    
    def load_punct(self, filename):
        self._add_char_to_set(self.punct, filename)

    def load_stop_words(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                self.stop_words.add(line.rstrip('\n').decode('utf-8'))

    def load_remove_words(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                self.remove_words.add(line.rstrip('\n').decode('utf-8'))

    def load_reserve_words(self, filename):
        with open(filename, 'r') as f:
            for line in f:
                self.reserve_words.add(line.rstrip('\n').decode('utf-8').lower())

    def check_is_mode(self, word):
        # A "model-like" word is lowercase letters and digits plus at least one hyphen.
        has_hyphen = False
        for c in word:
            if c == u'-':
                has_hyphen = True
                continue
            if not (u'a' <= c <= u'z') and not (u'0' <= c <= u'9'):
                return False
        return has_hyphen
    
    def check_valid_new(self, word):
        if word in self.reserve_words:
            return True
        if not word:
            return False
        if word.isnumeric():
            return False
        # unicode 编码无法使用 isalnum()
        if word.encode("u8").isalnum() and len(word) <= 3:
            return False
    #    if len(word) == 1 and ord(word) < 256:
        if len(word) == 1:
            return False
        if word in self.punct:
            return False
        if word in self.stop_words:
            return False
        if word in self.remove_words:
            return False
        if self.check_is_mode(word):
            return False
        try:
            float(word)
            return False
        except:
            pass
        return True
    

    def check_valid(self, word):
        if not word:
            return False
        if word.isnumeric():
            return False
        if word in self.punct:
            return False
        if len(word) == 1 and ord(word) < 256:
            return False
        if word[0].isdigit():
            return False
        if word in self.stop_words:
            return False
        if word in self.remove_words:
            return False
        if self.check_is_mode(word):
            return False
        return True  

    def convert_word_features(self, text):
        words = self.segmenter.segment(text.lower().strip())
        features = {}

        word0 = ""
        for word in words:
            word = word.strip().replace(u'(', u'').replace(u')', u'').replace(u'(', u'').replace(u')', u'')
            if not word:
                continue
            word = self.word_label.word_label(word, word0)
            word0 = word
            if not self.check_valid(word):
                continue
            features[word] = 1
        return features

    def convert_all(self, cid, name, cat, brand, price):
        remove_cat_count = 0
        try:
            config = zk_conf.get_client(cid)
            if config and "category_remove" in config:
                remove_cat_count = config["category_remove"]
        except Exception, e:
            logging.error("category_remove: %s", e)

        try:
            cat= json.dumps(json.loads(cat)[remove_cat_count:], separators=(',',':'), ensure_ascii=False)
        except:
            cat = u'[]'
        if brand.endswith(u'公司'):
            brand = u''
        name = self.extract_sentence(name)
        sample = self.convert_features_with_all(name, cat, brand, price)
        return (cid, name, cat, brand, price, sample)
Example #23
def test():
    s = Segmenter('test.png')

    s.threshold_and_morph(11)

    s.auto_segment()
Example #24
File: demo.py  Project: jason2506/mmseg.py
# -*- coding: utf-8 -*-

from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    w, f = line.strip().split(" ")
    f = log(float(f) + 1.0)
    return (w, f)


with open("dict.txt", "r", "utf-8") as fin:
    tf = dict(imap(wrap, fin))
    lex = Lexicon(tf)
    seg = Segmenter(lex)
    result = seg.segment(u"這是一隻可愛的小花貓")
    print "/".join(result).encode("utf-8")
Example #25
class Ui_MainWindow(object):
    def setupUi(self, MainWindow):

        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(570, 351)

        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")

        self.selectImageBtn = QtWidgets.QPushButton(self.centralwidget)
        self.selectImageBtn.setGeometry(QtCore.QRect(30, 300, 93, 28))
        self.selectImageBtn.setObjectName("selectImageBtn")

        self.imageLabel = QtWidgets.QLabel(self.centralwidget)
        self.imageLabel.setGeometry(QtCore.QRect(20, 10, 531, 261))
        self.imageLabel.setFrameShape(QtWidgets.QFrame.Box)
        self.imageLabel.setText("")
        self.imageLabel.setObjectName("imageLabel")

        self.submitBtn = QtWidgets.QPushButton(self.centralwidget)
        self.submitBtn.setGeometry(QtCore.QRect(460, 300, 93, 28))
        self.submitBtn.setObjectName("submitBtn")

        self.thresholdDec = QtWidgets.QPushButton(self.centralwidget)
        self.thresholdDec.setGeometry(QtCore.QRect(150, 300, 31, 28))
        self.thresholdDec.setObjectName("thresholdDec")

        self.thresholdInc = QtWidgets.QPushButton(self.centralwidget)
        self.thresholdInc.setGeometry(QtCore.QRect(250, 300, 31, 28))
        self.thresholdInc.setObjectName("thresholdInc")

        self.thresholdVal = QtWidgets.QLineEdit(self.centralwidget)
        self.thresholdVal.setGeometry(QtCore.QRect(190, 300, 51, 31))
        self.thresholdVal.setObjectName("thresholdVal")
        self.thresholdVal.setAlignment(QtCore.Qt.AlignCenter)
        self.thresholdVal.setReadOnly(True)
        self.thresholdVal.setText("5")

        self.autoSegmentBtn = QtWidgets.QPushButton(self.centralwidget)
        self.autoSegmentBtn.setGeometry(QtCore.QRect(320, 300, 111, 28))
        self.autoSegmentBtn.setObjectName("autoSegmentBtn")

        MainWindow.setCentralWidget(self.centralwidget)

        self.filePath = ""
        #setting buttons to be disabled at the beginning
        self.submitBtn.setEnabled(False)
        self.thresholdDec.setEnabled(False)
        self.thresholdInc.setEnabled(False)
        self.thresholdVal.setEnabled(False)
        self.autoSegmentBtn.setEnabled(False)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

        self.selectImageBtn.clicked.connect(self.setImage)
        self.thresholdInc.clicked.connect(self.increaseThreshold)
        self.thresholdDec.clicked.connect(self.decreaseThreshold)
        self.autoSegmentBtn.clicked.connect(self.autoSegment)
        #self.submitBtn.clicked.connect(self.uploadImage)

    def retranslateUi(self, MainWindow):
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.selectImageBtn.setText(_translate("MainWindow", "Select Image"))
        self.submitBtn.setText(_translate("MainWindow", "Submit"))
        self.thresholdDec.setText(_translate("MainWindow", "▼"))
        self.thresholdInc.setText(_translate("MainWindow", "▲"))
        self.autoSegmentBtn.setText(_translate("MainWindow", "Auto-Segment"))

    def setImage(self):
        filePath, _ = QtWidgets.QFileDialog.getOpenFileName(
            None, "Select Image", "", "Image Files (*.png *.jpg *.jpeg *.bmp)")
        if filePath:
            pixmap = QtGui.QPixmap(filePath)
            pixmap = pixmap.scaled(self.imageLabel.width(),
                                   self.imageLabel.height(),
                                   QtCore.Qt.KeepAspectRatio)
            self.imageLabel.setPixmap(pixmap)
            self.imageLabel.setAlignment(QtCore.Qt.AlignCenter)
            self.thresholdInc.setEnabled(True)
            self.thresholdDec.setEnabled(True)
            self.thresholdVal.setEnabled(True)
            self.autoSegmentBtn.setEnabled(True)
            self.selectImageBtn.setEnabled(False)
        self.filePath = filePath
        self.segmenter = Segmenter(filePath)

    def uploadImage(self):
        pass

    def increaseThreshold(self):
        val = int(self.thresholdVal.text())
        self.thresholdVal.setText(str(val + 1))
        val = val + 1
        self.segmenter.threshold_and_morph(val)

    def decreaseThreshold(self):
        val = int(self.thresholdVal.text())
        if val == 1:
            return
        self.thresholdVal.setText(str(val - 1))
        val = val - 1
        self.segmenter.threshold_and_morph(val)

    def autoSegment(self):
        self.segmenter.auto_segment(int(self.thresholdVal.text()))
        self.showdialog()

    def showdialog(self):
        msg = QtWidgets.QMessageBox()
        msg.about(self.centralwidget, "Done", "Success!")
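The UI class above is written in the style of pyuic output, so it still needs the usual PyQt5 bootstrap to run; a minimal sketch, assuming only the PyQt5 modules already imported by the class:

import sys
from PyQt5 import QtWidgets

if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    MainWindow = QtWidgets.QMainWindow()
    ui = Ui_MainWindow()
    ui.setupUi(MainWindow)
    MainWindow.show()
    sys.exit(app.exec_())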
Example #26
# -*- coding: utf-8 -*-

from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    w, f = line.strip().split(' ')
    f = log(float(f) + 1.0)
    return (w, f)


with open('dict.txt', 'r', 'utf-8') as fin:
    tf = dict(imap(wrap, fin))
    lex = Lexicon(tf)
    seg = Segmenter(lex)
    result = seg.segment(u'這是一隻可愛的小花貓')
    print('/'.join(result).encode('utf-8'))
Example #27
    def _get_adjlists(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict(
        )  # 2D dict, dict[word1][word2]=prob(going from word1 to word2)
        # Count number of co-occurrence.
        """
        ######################## count relationship per sentence ###################
        for poem in poems:
            for sentence in poem:
                words = []
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        #keep only non-stopwords words
                        words.append(word)
                for word in words:
                    if word not in adjlists:
                        #initialize all words to a new dict()
                        adjlists[word] = dict()
                for i in range(len(words)):
                    for j in range(i + 1, len(words)):
                        #### if two words present in the same sentence, their score +=1 #####
                        if words[j] not in adjlists[words[i]]:
                            adjlists[words[i]][words[j]] = 1.0
                        else:
                            adjlists[words[i]][words[j]] += 1.0
                        if words[i] not in adjlists[words[j]]:
                            adjlists[words[j]][words[i]] = 1.0
                        else:
                            adjlists[words[j]][words[i]] += 1.0

        ######################## end count relationship per sentence ###################
        """

        ######################## count relationship per poem ###################
        for poem in poems:
            words = []
            for sentence in poem:
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        # keep only non-stopword words
                        words.append(word)
            for word in words:
                if word not in adjlists:
                    #initialize all words to a new dict()
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    #### if two words appear in the same poem, their score += 1 #####
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0

        ######################## end count relationship per poem ###################

        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists
Example #28
def main(page_array, conf=Config(viterbi_postprocess=False, line_break_method = None, page_type = None), retries=0,
         text=False, page_info={}):
    '''Main procedure for processing a page from start to finish
    
    Parameters:
    --------------------
    page_array: a 2 dimensional numpy array containing binary pixel data of 
        the image
    
    page_info: dictionary, optional
        A dictionary containing metadata about the page to be recognized.
        Define strings for the keywords "flname" and "volume" if saving
        a serialized copy of the OCR results. 

    retries: Used internally when the system retries a failed attempt
    
    text: boolean flag. If true, return text rather than char-position data
    
    Returns:
    --------------
    text: str
        Recognized text for entire page
        
    if text=False, return character position and label data as a python dictionary
    '''
    
    print page_info.get('flname','')
    
    confpath = conf.path
    conf = conf.conf
    
    line_break_method = conf['line_break_method']
    page_type = conf['page_type']

    ### Set the line_break method automatically if it hasn't been
    ### specified beforehand
    if not line_break_method and not page_type:
        if page_array.shape[1] > 2*page_array.shape[0]:
            print 'setting page type as pecha'
            line_break_method = 'line_cluster'
            page_type = 'pecha'
        else: 
            print 'setting page type as book'
            line_break_method = 'line_cut'
            page_type = 'book' 
            
    conf['page_type'] = page_type
    conf['line_break_method'] = line_break_method
    detect_o = conf.get('detect_o', False)
    print 'clear hr', conf.get('clear_hr', False)

    results = []
    out = u''
    try:
        ### Get information about the pages
        shapes = PE2(page_array, cls, page_type=page_type, 
                     low_ink=conf['low_ink'], 
                     flpath=page_info.get('flname',''),
                     detect_o=detect_o, 
                     clear_hr =  conf.get('clear_hr', False))
        shapes.conf = conf
        
        ### Separate the lines on a page
        if page_type == 'pecha':
            k_groups = shapes.num_lines
        shapes.viterbi_post = conf['viterbi_postprocess']
        
        if line_break_method == 'line_cut':
            line_info = LineCut(shapes)
            if not line_info: # immediately skip to re-run with LineCluster
                sys.exit()
        elif line_break_method == 'line_cluster':
            line_info = LineCluster(shapes, k=k_groups)
        
        
        ### Perform segmentation of characters
        segmentation = Segmenter(line_info)

        ###Perform recognition
        if not conf['viterbi_postprocess']:
            if conf['recognizer'] == 'probout':
                results = recognize_chars_probout(segmentation)
            elif conf['recognizer'] == 'hmm':
                results = recognize_chars_hmm(segmentation, trans_p, start_p)
            elif conf['recognizer'] == 'kama':
                results = recognize_chars_probout(segmentation)
                results = recognize_chars_kama(results, segmentation)
            if conf['postprocess']:
                results = viterbi_post_process(segmentation.line_info.shapes.img_arr, results)
        else: # Should only be called from *within* a non-viterbi run...

            prob, results = hmm_recognize_bigram(segmentation)
            return prob, results
        
        
        ### Construct an output string
        output  = []
        for n, line in enumerate(results):
            for m,k in enumerate(line):
#                 if isinstance(k[-1], int):
#                     print n,m,k
#                     page_array[k[1]:k[1]+k[3], k[0]:k[0]+k[2]] = 0
#                     Image.fromarray(page_array*255).show()
                    
                output.append(k[-1])
            output.append(u'\n')

        out =  ''.join(output)
        print out
    
        if text:
            results = out
        
        return results
    except:
        ### Retry and assume the error was caused by use of the
        ### wrong line_break_method...
        import traceback;traceback.print_exc()
        if not results and not conf['viterbi_postprocess']:
            print 'WARNING', '*'*40
            print page_info['flname'], 'failed to return a result.'
            print 'WARNING', '*'*40
            print
            if line_break_method == 'line_cut' and retries < 1:
                print 'retrying with line_cluster instead of line_cut'
                try:
                    return main(page_array, conf=Config(path=confpath, line_break_method='line_cluster', page_type='pecha'), page_info=page_info, retries = 1, text=text)
                except:
                    logging.info('Exited after failure of second run.')
                    return []
        if not conf['viterbi_postprocess']: 
            if not results:
                logging.info('***** No OCR output for %s *****' % page_info['flname'])
            return results
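A minimal usage sketch for the page-recognition entry point above; the file name and volume are hypothetical, and the only assumption beyond the code shown is that the scanned page is binarized into a 2-D numpy array, as the docstring requires.

import numpy as np
from PIL import Image

# Load a scanned page and binarize it to 0/1 pixel values.
img = Image.open('page_0001.tif').convert('L')
page_array = (np.asarray(img) > 127).astype(np.uint8)

text = main(page_array,
            conf=Config(viterbi_postprocess=False, line_break_method=None, page_type=None),
            text=True, page_info={'flname': 'page_0001.tif', 'volume': 'vol01'})
print(text)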