Example #1
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = list(filter(lambda seg: seg in ranks, segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x,y: x if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag:
                data.extend(rows)
                kw_data.append(kw_row)
        if 0 == (idx+1)%2000:
            print("[Training Data] %d/%d poems are processed." %(idx+1, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row)+'\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row)+'\n')
    print("Training data is generated.")
Example #2
 def _get_adjlists(self):
     print("[TextRank] Generating word graph ...")
     segmenter = Segmenter()
     poems = Poems()
     adjlists = dict()
     # Count number of co-occurrence.
     for poem in poems:
         for sentence in poem:
             words = []
             for word in segmenter.segment(sentence):
                 if word not in self.stopwords:
                     words.append(word)
             for word in words:
                 if word not in adjlists:
                     adjlists[word] = dict()
             for i in range(len(words)):
                 for j in range(i + 1, len(words)):
                     if words[j] not in adjlists[words[i]]:
                         adjlists[words[i]][words[j]] = 1.0
                     else:
                         adjlists[words[i]][words[j]] += 1.0
                     if words[i] not in adjlists[words[j]]:
                         adjlists[words[j]][words[i]] = 1.0
                     else:
                         adjlists[words[j]][words[i]] += 1.0
     # Normalize weights.
     for a in adjlists:
         sum_w = sum(w for _, w in adjlists[a].items())
         for b in adjlists[a]:
             adjlists[a][b] /= sum_w
     return adjlists
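The adjacency lists above are plain nested dicts: symmetric co-occurrence counts within each sentence, then each row is normalized so a word's outgoing weights sum to 1. A minimal sketch of the same idea on hand-segmented toy data (no Segmenter or Poems needed):

# Toy corpus: each inner list stands for one already-segmented sentence.
sentences = [['a', 'b', 'c'], ['a', 'c'], ['b', 'c']]

adjlists = {}
for words in sentences:
    for word in words:
        adjlists.setdefault(word, {})
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            # Symmetric co-occurrence count within one sentence.
            adjlists[words[i]][words[j]] = adjlists[words[i]].get(words[j], 0.0) + 1.0
            adjlists[words[j]][words[i]] = adjlists[words[j]].get(words[i], 0.0) + 1.0

# Normalize each row so the outgoing weights sum to 1.
for a in adjlists:
    sum_w = sum(adjlists[a].values())
    for b in adjlists[a]:
        adjlists[a][b] /= sum_w

print(adjlists['c'])  # {'a': 0.5, 'b': 0.5}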
Example #3
def _rank_all_words():
    segmenter = Segmenter()  # generates the sxhy dict
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            segs = list(filter(lambda word: word not in stopwords,
                               segmenter.segment(sentence)))  # segmented words with stopwords removed
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            for i, seg in enumerate(segs):
                for _, other in enumerate(segs[i + 1:]):
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg][other]+1 \
                                if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg]+1 \
                                if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())  #权重增加
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
Example #4
def _rank_all_words():
    segmenter = Segmenter()  # sentence segmenter
    stopwords = get_stopwords()  # stopword list
    print "Start TextRank over the selected quatrains ..."
    quatrains = get_quatrains()  # the collection of quatrains
    adjlist = dict()
    for idx, poem in enumerate(quatrains):  # for each poem
        if 0 == (idx + 1) % 10000:
            print "[TextRank] Scanning %d/%d poems ..." % (idx + 1,
                                                           len(quatrains))
        for sentence in poem['sentences']:  # for each line of the poem
            segs = filter(lambda word: word not in stopwords,
                          segmenter.segment(sentence))  # segments that are not stopwords
            for seg in segs:  # for each segment
                if seg not in adjlist:
                    adjlist[seg] = dict()  # one dict per segment
            for i, seg in enumerate(segs):  # for each segment
                for _, other in enumerate(
                        segs[i + 1:]):  # compare with every later segment; this builds the word graph TextRank needs
                    if seg != other:  # a neat bit of code
                        adjlist[seg][other] = adjlist[seg][other]+1 \
                                if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg]+1 \
                                if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(
            weight
            for other, weight in adjlist[word].items())  # total weight of all words adjacent to this word
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum  # normalize each weight by that total
    print "[TextRank] Weighted graph has been built."
    _text_rank(adjlist)
Example #5
def get_pop_quatrains(num = 100000):
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    min_word_cnts = [_min_word_cnt(cnts, quatrain, segmenter) \
            for i, quatrain in enumerate(quatrains)]
    indexes = sorted(range(len(quatrains)), key = lambda i: -min_word_cnts[i])
    return [quatrains[index] for index in indexes[:min(num, len(indexes))]]
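The selection above is an index sort: score every quatrain by the count of its rarest word, sort the indexes by that score in descending order, and keep the first num. A minimal sketch of that pattern with hypothetical scores:

# Hypothetical per-poem scores (count of each poem's rarest word).
min_word_cnts = [3, 10, 1, 7]
num = 2
indexes = sorted(range(len(min_word_cnts)), key=lambda i: -min_word_cnts[i])
print(indexes[:min(num, len(indexes))])  # -> [1, 3]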
Example #6
def _gen_train_data():
    sampled_poems = np.array(random_int_list(1, 70000, 4000))
    segmenter = Segmenter()  # generates the sxhy dict
    poems = get_pop_quatrains()  # get the most popular 100,000 quatrains
    random.shuffle(poems)  # shuffle the order
    ranks = get_word_ranks()  # TextRank: word -> rank number
    print("Generating training data ...")
    data = []
    kw_data = []
    test_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            test_flag = True
            rows = []
            kw_row = []
            test_row = []
            if idx in sampled_poems:
                test_flag = False
            for sentence in sentences:
                rows.append([sentence])
                test_row.append([sentence])
                segs = list(
                    filter(lambda seg: seg in ranks,
                           segmenter.segment(sentence)))
                if 0 == len(segs):
                    flag = False
                    break
                keyword = reduce(lambda x, y: x if ranks[x] < ranks[y] else y,
                                 segs)  # pick the highest-ranked keyword (smallest rank number)
                kw_row.append(keyword)
                rows[-1].append(keyword)
            if flag and test_flag:
                data.extend(rows)
                kw_data.append(kw_row)
            if flag and not test_flag:
                test_data.extend(test_row)

        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." %
                  (idx + 1, len(poems)))
    print(test_data)
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    with codecs.open(test_path, 'w', 'utf-8') as fout:
        for test_row in test_data:
            fout.write('\t'.join(test_row) + '\n')
    print("Training data is generated.")
Example #7
def gen_train_data():
    """获取每一句的keywords,拼起来写入文件"""
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    ranked_words = RankedWords()

    gen_data = list()
    plan_data = list()

    valid = True
    counter_line = 0
    print('len(poems)==>', len(poems))
    for poem in poems:
        # print(len(poem))
        if len(poem) != 4:
            # print(poem)
            valid = False
            continue
        context = start_of_sentence()
        keywords = list()
        for sentence in poem:
            counter_line += 1
            keyword = ''
            if len(sentence) != 7:
                valid = False
                break
            filterwords = list(
                filter(lambda x: x in ranked_words,
                       segmenter.segment(sentence)))
            if filterwords:
                keyword = filterwords[0]
            for word in filterwords:
                # print('word==>',word)
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            if keyword:
                gen_line = sentence + end_of_sentence() + \
                           '\t' + keyword + '\t' + context + '\n'
                keywords.append(keyword)
                gen_data.append(gen_line)
                context += sentence + end_of_sentence()
        plan_data.append(' '.join(keywords))
    with open(plan_data_path, 'w') as fw:
        for data_iter in plan_data:
            fw.write(data_iter + '\n')
    with open(gen_data_path, 'w') as fw:
        for data_iter in gen_data:
            fw.write(data_iter)

    print('counter_line==>', counter_line)
    del segmenter, poems, ranked_words
Example #8
def _gen_word_cnts():
    counters = dict()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            segs = segmenter.segment(sentence)
            for seg in segs:
                counters[seg] = counters[seg]+1 if seg in counters else 1
        if 0 == (idx+1)%10000:
            print "[Word Count] %d/%d quatrains has been processed." %(idx+1, len(quatrains))
    with codecs.open(_wc_path, 'w', 'utf-8') as fout:
        json.dump(counters, fout)
Example #9
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        if len(poem) != 4:
            continue  # Only consider quatrains.
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                # only consider seven-character lines
                valid = False
                break
            #get a list of selected words from this sentence
            #ignore all words if they are not in the ranked words list
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]

            # from all words in this sentence, get the word with highest text_rank score
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word

            gen_line = sentence + end_of_sentence() + \
                       '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            # plan data: each line is four keywords from the 4 sentences
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
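Each gen_data line therefore holds three tab-separated fields: the sentence with its end marker, the chosen keyword, and the context accumulated so far. A minimal sketch of that layout, using hypothetical placeholder markers instead of the real start_of_sentence()/end_of_sentence() values:

# Hypothetical markers; the real ones come from start_of_sentence()/end_of_sentence().
SOS, EOS = '^', '$'

sentence, keyword, context = '一二三四五六七', '三四', SOS
gen_line = sentence + EOS + '\t' + keyword + '\t' + context + '\n'
print(repr(gen_line))  # '一二三四五六七$\t三四\t^\n'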
Example #10
    def _do_text_rank(self):
        print("Do text ranking ...")
        adjlists = self._get_adjlists()
        #adjlists = self._build_adjlists_from_tencent_embeddings()
        print("[TextRank] Total words: %d" % len(adjlists))

        # Value initialization.
        scores = dict()
        for word in adjlists:
            #score[0] is previous score, score[1] is new score
            scores[word] = [1.0, 1.0]

        # Synchronous value iterations.
        itr = 0
        #### train text rank here #####
        while True:
            sys.stdout.write("[TextRank] Iteration %d ..." % itr)
            sys.stdout.flush()
            for word, adjlist in adjlists.items():
                scores[word][1] = (1.0 - _damp) + _damp * \
                        sum(adjlists[other][word] * scores[other][0]
                                for other in adjlist)

            #eps is the difference between new score and previous score, used to check for convergence
            eps = 0
            for word in scores:
                eps = max(eps, abs(scores[word][0] - scores[word][1]))
                scores[word][0] = scores[word][1]
            print(" eps = %f" % eps)
            # if eps <= 1e-6:
            #     break
            #if itr == 200:  # train for only 200 iteration ###########################
            if itr == NUM_Of_ITERATIONS:
                break
            itr += 1

        # Dictionary-based comparison with TextRank score as a tie-breaker.
        segmenter = Segmenter()

        def cmp_key(x):
            word, score = x
            return (0 if word in segmenter.sxhy_dict else 1, -score)

        words = sorted([(word, score[0]) for word, score in scores.items()],
                       key=cmp_key)

        # Store ranked words and scores.
        with open(wordrank_path, 'w') as fout:
            json.dump(words, fout)
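The update inside the while loop is the standard TextRank/PageRank recurrence: new_score(w) = (1 - d) + d * sum over incoming neighbors v of weight(v -> w) * old_score(v), applied synchronously to every word. A minimal sketch on a tiny hand-built graph (the damping factor and threshold below are illustrative values, not the module's _damp or NUM_Of_ITERATIONS):

damp = 0.85  # illustrative damping factor

# Tiny row-normalized graph: graph[v][w] is the weight of edge v -> w.
graph = {
    'a': {'b': 0.5, 'c': 0.5},
    'b': {'a': 1.0},
    'c': {'a': 1.0},
}

scores = {w: 1.0 for w in graph}
for _ in range(100):
    new_scores = {
        w: (1.0 - damp) + damp * sum(graph[v][w] * scores[v]
                                     for v in graph if w in graph[v])
        for w in graph
    }
    eps = max(abs(new_scores[w] - scores[w]) for w in graph)
    scores = new_scores
    if eps <= 1e-6:
        break

print(sorted(scores.items(), key=lambda x: -x[1]))  # 'a' ends up with the highest score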
Example #11
 def _train(self):
     print("Start training Word2Vec for planner ...")
     quatrains = get_quatrains()
     segmenter = Segmenter()
     seg_lists = []
     for idx, quatrain in enumerate(quatrains):
         seg_list = []
         for sentence in quatrain['sentences']:
             seg_list.extend([seg for seg in segmenter.segment(sentence) if seg in self.ranks])
         seg_lists.append(seg_list)
         if 0 == (idx+1)%10000:
             print("[Plan Word2Vec] %d/%d quatrains has been processed." %(idx+1, len(quatrains)))
     print("Hold on. This may take some time ...")
     self.model = models.Word2Vec(seg_lists, size = 512, min_count = 5)
     self.model.save(_model_path)
Example #12
 def setImage(self):
     filePath, _ = QtWidgets.QFileDialog.getOpenFileName(
         None, "Select Image", "", "Image Files (*.png *.jpg *.jpeg *.bmp)")
     if filePath:
         pixmap = QtGui.QPixmap(filePath)
         pixmap = pixmap.scaled(self.imageLabel.width(),
                                self.imageLabel.height(),
                                QtCore.Qt.KeepAspectRatio)
         self.imageLabel.setPixmap(pixmap)
         self.imageLabel.setAlignment(QtCore.Qt.AlignCenter)
         self.thresholdInc.setEnabled(True)
         self.thresholdDec.setEnabled(True)
         self.thresholdVal.setEnabled(True)
         self.autoSegmentBtn.setEnabled(True)
         self.selectImageBtn.setEnabled(False)
     self.filePath = filePath
     self.segmenter = Segmenter(filePath)
Example #13
def gen_train_data():
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # only handle four-line poems with seven-character lines
        if len(poem) != 4:
            continue
        valid = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                valid = False
                break
            words = list(
                filter(lambda seg: seg in ranked_words,
                       segmenter.segment(sentence)))
            if len(words) == 0:
                valid = False
                break
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(
                        keyword):
                    keyword = word
            gen_line = sentence + end_of_sentence() + \
                    '\t' + keyword + '\t' + context + '\n'
            gen_lines.append(gen_line)
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if valid:
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
Example #14
 def _get_adjlists(self):
     poems = Poems()
     segmenter = Segmenter()
     adjlists = collections.defaultdict(dict)
     for poem_set in poems:
         for poem in poem_set:
             words = segmenter.segment(poem)
             for i in range(len(words) - 1):
                 for j in range(i + 1, len(words)):
                     if words[j] not in adjlists[words[i]]:
                         adjlists[words[i]][words[j]] = 1.0
                     else:
                         adjlists[words[i]][words[j]] += 1.0
                     if words[i] not in adjlists[words[j]]:
                         adjlists[words[j]][words[i]] = 1.0
                     else:
                         adjlists[words[j]][words[i]] += 1.0
     return adjlists
Example #15
 def _train(self):
     print "Start training Word2Vec for planner ..."
     quatrains = get_quatrains()
     segmenter = Segmenter()  # segmenting a line is not the same as taking each individual word
     seg_lists = []
     for idx, quatrain in enumerate(quatrains):
         seg_list = []
         for sentence in quatrain['sentences']:
             seg_list.extend(
                 filter(lambda seg: seg in self.ranks,
                        segmenter.segment(sentence)))
         seg_lists.append(seg_list)
         if 0 == (idx + 1) % 10000:
             print "[Plan Word2Vec] %d/%d quatrains has been processed." % (
                 idx + 1, len(quatrains))
     print "Hold on. This may take some time ..."
     self.model = models.Word2Vec(seg_lists, size=512,
                                  min_count=5)  # Word2Vec class: trains the word-vector model
     self.model.save(_model_path)
Example #16
    def _build_adjlists_from_tencent_embeddings(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict(
        )  # 2D dict, dict[word1][word2]=prob(going from word1 to word2)
        wv = get_tencent_embedding_keyedVectors(_tencent_embedding_path)

        # Count number of co-occurrence.

        ######################## get a 2D cos sim matrix for all words ###################
        words = set()
        for poem in poems:
            for sentence in poem:
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        #keep only non-stopwords words
                        words.add(word)
        for word in words:
            if word not in adjlists:
                #initialize all words to a new dict()
                adjlists[word] = dict()

        for word in words:
            for other in words:

                if word == other:
                    continue

                if other in adjlists[word] or word in adjlists[other]:
                    continue

                sim = wv.similarity(word, other)
                adjlists[word][other] = sim
                adjlists[other][word] = sim

        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists
Example #17
    def _do_text_rank(self):
        print("Do text ranking ...")
        adjlists = self._get_adjlists()
        print("[TextRank] Total words: %d" % len(adjlists))

        # Value initialization.
        scores = dict()
        for word in adjlists:
            scores[word] = [1.0, 1.0]

        # Synchronous value iterations.
        itr = 0
        while True:
            sys.stdout.write("[TextRank] Iteration %d ..." % itr)
            sys.stdout.flush()
            for word, adjlist in adjlists.items():
                scores[word][1] = (1.0 - _damp) + _damp * \
                        sum(adjlists[other][word] * scores[other][0]
                                for other in adjlist)
            eps = 0
            for word in scores:
                eps = max(eps, abs(scores[word][0] - scores[word][1]))
                scores[word][0] = scores[word][1]
            print(" eps = %f" % eps)
            if eps <= 1e-6:
                break
            itr += 1

        # Dictionary-based comparison with TextRank score as a tie-breaker.
        segmenter = Segmenter()

        def cmp_key(x):
            word, score = x
            return (0 if word in segmenter.sxhy_dict else 1, -score)

        words = sorted([(word, score[0]) for word, score in scores.items()],
                       key=cmp_key)

        # Store ranked words and scores.
        with open(wordrank_path, 'w') as fout:
            json.dump(words, fout)
Example #18
def _gen_train_data():
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print "Generating training data ..."
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            flag = True
            lines = u''
            rows = []
            kw_row = []
            for sentence in sentences:
                rows.append([sentence])
                segs = filter(lambda seg: seg in ranks,
                              segmenter.segment(sentence))
                if 0 == len(segs):  # if no segment of this line is in ranks, the whole poem is discarded
                    flag = False
                    break
                keyword = reduce(lambda x, y: x
                                 if ranks[x] < ranks[y] else y, segs)
                kw_row.append(keyword)
                rows[-1].append(keyword)  # each element of rows is the line plus its keyword
            if flag:
                data.extend(rows)  # extend: each element of data is one [sentence, keyword] row
                kw_data.append(kw_row)  # append the whole keyword row for this poem
        if 0 == (idx + 1) % 2000:
            print "[Training Data] %d/%d poems are processed." % (idx + 1,
                                                                  len(poems))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')  # each output line is a tab-separated sentence plus its keyword
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print "Training data is generated."
Example #19
    def _do_text_rank(self):
        """scores,给所有词设置 双score    每个句子进行词语之间的组合, 迭代词语分数   给分数排序"""
        print("Do text ranking ...")
        segment = Segmenter()
        scores = dict()
        adjlists = self._get_adjlists()
        for word in adjlists:
            scores[word] = [1.0, 1.0]

        for word, adjust in adjlists.items():
            sums = sum([w for _, w in adjust.items()])
            for other, weight in adjust.items():
                adjust[other] = weight / sums
        _damp = 0.85
        while True:
            for word, adjust in adjlists.items():
                scores[word][1] = (1 - _damp) + _damp * sum([
                    scores[word][0] * adjlists[other][word] for other in adjust
                ])
            eps = 0.0
            for word in scores:
                eps = max(eps, abs(scores[word][0] - scores[word][1]))
                scores[word][0] = scores[word][1]
            print('eps=>', eps)
            if eps < 0.05:
                break

        def tmp_key(x):
            word, score = x
            return 0 if word in segment.sxhy_dict else -1, -score

        word_and_scores = sorted([(word, score[0])
                                  for word, score in scores.items()],
                                 key=tmp_key)
        with open(wordrank_path, 'w') as fw:
            json.dump(word_and_scores, fw)
        return scores
Example #20
def test():
    s = Segmenter('test.png')

    s.threshold_and_morph(11)

    s.auto_segment()
Example #21
# -*- coding: utf-8 -*-

from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    w, f = line.strip().split(' ')
    f = log(float(f) + 1.0)
    return (w, f)


with open('dict.txt', 'r', 'utf-8') as fin:
    tf = dict(imap(wrap, fin))
    lex = Lexicon(tf)
    seg = Segmenter(lex)
    result = seg.segment(u'這是一隻可愛的小花貓')
    print('/'.join(result).encode('utf-8'))
Example #22
def main(page_array, conf=Config(viterbi_postprocess=False, line_break_method = None, page_type = None), retries=0,
         text=False, page_info={}):
    '''Main procedure for processing a page from start to finish
    
    Parameters:
    --------------------
    page_array: a 2 dimensional numpy array containing binary pixel data of 
        the image
    
    page_info: dictionary, optional
        A dictionary containing metadata about the page to be recognized.
        Define strings for the keywords "flname" and "volume" if saving
        a serialized copy of the OCR results. 

    retries: Used internally when system attempts to reboot a failed attempt
    
    text: boolean flag. If true, return text rather than char-position data
    
    Returns:
    --------------
    text: str
        Recognized text for entire page
        
    if text=False, return character position and label data as a python dictionary
    '''
    
    print page_info.get('flname','')
    
    confpath = conf.path
    conf = conf.conf
    
    line_break_method = conf['line_break_method']
    page_type = conf['page_type']

    ### Set the line_break method automatically if it hasn't been
    ### specified beforehand
    if not line_break_method and not page_type:
        if page_array.shape[1] > 2*page_array.shape[0]:
            print 'setting page type as pecha'
            line_break_method = 'line_cluster'
            page_type = 'pecha'
        else: 
            print 'setting page type as book'
            line_break_method = 'line_cut'
            page_type = 'book' 
            
    conf['page_type'] = page_type
    conf['line_break_method'] = line_break_method
    detect_o = conf.get('detect_o', False)
    print 'clear hr', conf.get('clear_hr', False)

    results = []
    out = u''
    try:
        ### Get information about the pages
        shapes = PE2(page_array, cls, page_type=page_type, 
                     low_ink=conf['low_ink'], 
                     flpath=page_info.get('flname',''),
                     detect_o=detect_o, 
                     clear_hr =  conf.get('clear_hr', False))
        shapes.conf = conf
        
        ### Separate the lines on a page
        if page_type == 'pecha':
            k_groups = shapes.num_lines
        shapes.viterbi_post = conf['viterbi_postprocess']
        
        if line_break_method == 'line_cut':
            line_info = LineCut(shapes)
            if not line_info: # immediately skip to re-run with LineCluster
                sys.exit()
        elif line_break_method == 'line_cluster':
            line_info = LineCluster(shapes, k=k_groups)
        
        
        ### Perform segmentation of characters
        segmentation = Segmenter(line_info)

        ###Perform recognition
        if not conf['viterbi_postprocess']:
            if conf['recognizer'] == 'probout':
                results = recognize_chars_probout(segmentation)
            elif conf['recognizer'] == 'hmm':
                results = recognize_chars_hmm(segmentation, trans_p, start_p)
            elif conf['recognizer'] == 'kama':
                results = recognize_chars_probout(segmentation)
                results = recognize_chars_kama(results, segmentation)
            if conf['postprocess']:
                results = viterbi_post_process(segmentation.line_info.shapes.img_arr, results)
        else:  # Should only be called from *within* a non-viterbi run...

            prob, results = hmm_recognize_bigram(segmentation)
            return prob, results
        
        
        ### Construct an output string
        output  = []
        for n, line in enumerate(results):
            for m,k in enumerate(line):
#                 if isinstance(k[-1], int):
#                     print n,m,k
#                     page_array[k[1]:k[1]+k[3], k[0]:k[0]+k[2]] = 0
#                     Image.fromarray(page_array*255).show()
                    
                output.append(k[-1])
            output.append(u'\n')

        out =  ''.join(output)
        print out
    
        if text:
            results = out
        
        return results
    except:
        ### Retry and assume the error was caused by use of the
        ### wrong line_break_method...
        import traceback;traceback.print_exc()
        if not results and not conf['viterbi_postprocess']:
            print 'WARNING', '*'*40
            print page_info['flname'], 'failed to return a result.'
            print 'WARNING', '*'*40
            print
            if line_break_method == 'line_cut' and retries < 1:
                print 'retrying with line_cluster instead of line_cut'
                try:
                    return main(page_array, conf=Config(path=confpath, line_break_method='line_cluster', page_type='pecha'), page_info=page_info, retries = 1, text=text)
                except:
                    logging.info('Exited after failure of second run.')
                    return []
        if not conf['viterbi_postprocess']: 
            if not results:
                logging.info('***** No OCR output for %s *****' % page_info['flname'])
            return results
Example #23
 def generate_segmentation(self):  
     self.segmentation = Segmenter(self.line_info)
Example #24
    def _get_adjlists(self):
        print("[TextRank] Generating word graph ...")
        segmenter = Segmenter()
        poems = Poems()
        adjlists = dict(
        )  # 2D dict, dict[word1][word2]=prob(going from word1 to word2)
        # Count number of co-occurrence.
        """
        ######################## count relationship per sentence ###################
        for poem in poems:
            for sentence in poem:
                words = []
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        #keep only non-stopwords words
                        words.append(word)
                for word in words:
                    if word not in adjlists:
                        #initialize all words to a new dict()
                        adjlists[word] = dict()
                for i in range(len(words)):
                    for j in range(i + 1, len(words)):
                        #### if two words present in the same sentence, their score +=1 #####
                        if words[j] not in adjlists[words[i]]:
                            adjlists[words[i]][words[j]] = 1.0
                        else:
                            adjlists[words[i]][words[j]] += 1.0
                        if words[i] not in adjlists[words[j]]:
                            adjlists[words[j]][words[i]] = 1.0
                        else:
                            adjlists[words[j]][words[i]] += 1.0

        ######################## end count relationship per sentence ###################
        """

        ######################## count relationship per poem ###################
        for poem in poems:
            words = []
            for sentence in poem:
                for word in segmenter.segment(sentence):
                    # for each word selected from the sentence
                    if word not in self.stopwords:
                        # keep only non-stopword words
                        words.append(word)
            for word in words:
                if word not in adjlists:
                    # initialize each word with a fresh dict()
                    adjlists[word] = dict()
            for i in range(len(words)):
                for j in range(i + 1, len(words)):
                    #### if two words appear in the same poem, their co-occurrence count += 1 #####
                    if words[j] not in adjlists[words[i]]:
                        adjlists[words[i]][words[j]] = 1.0
                    else:
                        adjlists[words[i]][words[j]] += 1.0
                    if words[i] not in adjlists[words[j]]:
                        adjlists[words[j]][words[i]] = 1.0
                    else:
                        adjlists[words[j]][words[i]] += 1.0

        ######################## end count relationship per poem ###################

        # Normalize weights.
        for a in adjlists:
            sum_w = sum(w for _, w in adjlists[a].items())
            for b in adjlists[a]:
                adjlists[a][b] /= sum_w
        return adjlists