def _gen_train_data():
    """Write the keyword-annotated training files from the popular quatrains.

    For every poem with exactly four sentences, the keyword of a sentence is
    the segment with the smallest value in ``ranks`` (on equal values the
    later segment wins). A poem is skipped entirely as soon as one of its
    sentences has no segment present in ``ranks``.

    Outputs:
        ``train_path``: one ``sentence<TAB>keyword`` row per sentence.
        ``kw_train_path``: one row per poem holding its four keywords,
        tab-separated.
    """
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            rows = []
            kw_row = []
            usable = True
            for sentence in sentences:
                candidates = [seg for seg in segmenter.segment(sentence)
                              if seg in ranks]
                if not candidates:
                    usable = False
                    break
                keyword = candidates[0]
                for cand in candidates[1:]:
                    # Keep the current pick only when it is strictly better.
                    if not ranks[keyword] < ranks[cand]:
                        keyword = cand
                kw_row.append(keyword)
                rows.append([sentence, keyword])
            if usable:
                data.extend(rows)
                kw_data.append(kw_row)
        processed = idx + 1
        if processed % 2000 == 0:
            print("[Training Data] %d/%d poems are processed." %(processed, len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row)+'\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row)+'\n')
    print("Training data is generated.")
def _get_adjlists(self):
    """Build the normalised word co-occurrence graph used by TextRank.

    Each sentence is segmented and words found in ``self.stopwords`` are
    dropped. Every pair of remaining word positions in the same sentence
    adds 1.0 to the edge weight in both directions. Finally each word's
    outgoing weights are divided by their sum.

    Returns:
        dict mapping word -> dict mapping neighbour -> normalised weight.
        Words that never co-occur with anything map to an empty dict.
    """
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    adjlists = {}
    # Count number of co-occurrence.
    for poem in poems:
        for sentence in poem:
            kept = [w for w in segmenter.segment(sentence)
                    if w not in self.stopwords]
            for w in kept:
                adjlists.setdefault(w, {})
            for i, left in enumerate(kept):
                for right in kept[i + 1:]:
                    adjlists[left][right] = adjlists[left].get(right, 0.0) + 1.0
                    adjlists[right][left] = adjlists[right].get(left, 0.0) + 1.0
    # Normalize weights.
    for neighbours in adjlists.values():
        total = sum(neighbours.values())
        for other in neighbours:
            neighbours[other] /= total
    return adjlists
def _rank_all_words():
    """Build the weighted co-occurrence graph of all quatrains and rank it.

    Every sentence is segmented, stopwords are removed, and each pair of
    distinct words in the same sentence adds 1 to the edge weight in both
    directions. Each word's outgoing weights are then divided by their sum
    and the graph is handed to ``_text_rank``.
    """
    segmenter = Segmenter()
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            # Materialise the segments: on Python 3 ``filter`` returns a
            # one-shot iterator, which cannot be iterated twice or sliced.
            segs = [word for word in segmenter.segment(sentence)
                    if word not in stopwords]
            for seg in segs:
                adjlist.setdefault(seg, dict())
            for i, seg in enumerate(segs):
                for other in segs[i + 1:]:
                    if seg != other:
                        adjlist[seg][other] = adjlist[seg].get(other, 0.0) + 1
                        adjlist[other][seg] = adjlist[other].get(seg, 0.0) + 1
    for word in adjlist:
        # Words without any neighbour have an empty dict, so the division
        # below is never reached with a zero sum.
        w_sum = sum(adjlist[word].values())
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
def _rank_all_words():
    """Build the weighted word graph of all quatrains and run TextRank on it.

    Each sentence is segmented and stopwords are removed. Every pair of
    distinct segments in the same sentence strengthens the edge between them
    in both directions (this is the graph structure TextRank needs). Each
    word's outgoing weights are then divided by their sum before the graph
    is passed to ``_text_rank``.
    """
    segmenter = Segmenter()      # sentence segmenter
    stopwords = get_stopwords()  # stopword collection
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()  # the quatrain collection
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        scanned = idx + 1
        if scanned % 10000 == 0:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (scanned, len(quatrains)))
        for sentence in poem['sentences']:
            # Segments of this sentence that are not stopwords.
            segs = [word for word in segmenter.segment(sentence)
                    if word not in stopwords]
            # Every segment gets its own neighbour dict.
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            # Pair each segment with every later segment of the sentence.
            for i, seg in enumerate(segs):
                neighbours = adjlist[seg]
                for other in segs[i + 1:]:
                    if seg == other:
                        continue
                    if other in neighbours:
                        neighbours[other] = neighbours[other] + 1
                    else:
                        neighbours[other] = 1.0
                    back = adjlist[other]
                    if seg in back:
                        back[seg] = back[seg] + 1
                    else:
                        back[seg] = 1.0
    for word in adjlist:
        # Sum of the weights of all words linked to ``word``.
        w_sum = sum(adjlist[word].values())
        for other in adjlist[word]:
            # Turn each raw weight into its share of the total.
            adjlist[word][other] /= w_sum
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
def _gen_train_data():
    """Write training, keyword and held-out test files from popular quatrains.

    Poems whose (post-shuffle) index is in the sampled index set are held
    out as test data; all other usable poems become training data. A poem is
    usable when it has exactly four sentences and every sentence contains at
    least one segment present in ``ranks``. The keyword of a sentence is the
    segment with the smallest value in ``ranks`` (the later segment wins on
    equal values).

    Outputs:
        ``train_path``: ``sentence<TAB>keyword`` rows of training poems.
        ``kw_train_path``: one tab-separated keyword row per training poem.
        ``test_path``: one sentence per line for the held-out poems.
    """
    # A set gives O(1) membership tests; the previous numpy array was
    # scanned linearly for every poem.
    sampled_poems = set(random_int_list(1, 70000, 4000))
    segmenter = Segmenter()
    poems = get_pop_quatrains()  # the popular quatrains
    random.shuffle(poems)
    ranks = get_word_ranks()     # TextRank word -> rank number
    print("Generating training data ...")
    data = []
    kw_data = []
    test_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            held_out = idx in sampled_poems
            rows = []
            kw_row = []
            usable = True
            for sentence in sentences:
                segs = [seg for seg in segmenter.segment(sentence)
                        if seg in ranks]
                if not segs:
                    usable = False
                    break
                # Pick the best-ranked segment as the sentence keyword.
                keyword = segs[0]
                for cand in segs[1:]:
                    if not ranks[keyword] < ranks[cand]:
                        keyword = cand
                kw_row.append(keyword)
                rows.append([sentence, keyword])
            if usable:
                if held_out:
                    test_data.extend([sentence] for sentence in sentences)
                else:
                    data.extend(rows)
                    kw_data.append(kw_row)
        if 0 == (idx + 1) % 2000:
            print("[Training Data] %d/%d poems are processed." %
                  (idx + 1, len(poems)))
    # Report the size of the held-out set instead of dumping all of it.
    print("[Training Data] %d sentences are held out for testing." %
          len(test_data))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    with codecs.open(test_path, 'w', 'utf-8') as fout:
        for test_row in test_data:
            fout.write('\t'.join(test_row) + '\n')
    print("Training data is generated.")
def gen_train_data():
    """Extract one keyword per sentence and write the plan/generation files.

    Only poems with four sentences of seven characters each are used. The
    keyword of a sentence is its best-ranked word (smallest
    ``ranked_words.get_rank``). A poem is dropped as a whole when a sentence
    has the wrong length or contains no ranked word, so no partial poem
    reaches either output file.

    Outputs:
        ``plan_data_path``: one line per poem with its keywords joined by a
        single space.
        ``gen_data_path``: one ``sentence<EOS><TAB>keyword<TAB>context`` line
        per sentence, where context is the preceding part of the poem.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    ranked_words = RankedWords()
    gen_data = []
    plan_data = []
    counter_line = 0  # number of sentences visited, reported at the end
    print('len(poems)==>', len(poems))
    for poem in poems:
        if len(poem) != 4:
            continue
        context = start_of_sentence()
        keywords = []
        gen_lines = []
        valid = True
        for sentence in poem:
            counter_line += 1
            if len(sentence) != 7:
                valid = False
                break
            candidates = [word for word in segmenter.segment(sentence)
                          if word in ranked_words]
            if not candidates:
                valid = False
                break
            # First word with the smallest rank value.
            keyword = min(candidates, key=ranked_words.get_rank)
            gen_lines.append(sentence + end_of_sentence() +
                             '\t' + keyword + '\t' + context + '\n')
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        # Commit the buffered lines only for fully valid poems.
        if valid:
            plan_data.append(' '.join(keywords))
            gen_data.extend(gen_lines)
    # The plan file receives the keyword lines (it used to be filled with
    # the generation lines, leaving ``plan_data`` unused).
    with open(plan_data_path, 'w') as fw:
        for plan_line in plan_data:
            fw.write(plan_line + '\n')
    with open(gen_data_path, 'w') as fw:
        for gen_line in gen_data:
            fw.write(gen_line)
    print('counter_line==>', counter_line)
def _gen_word_cnts():
    """Count how often each segment occurs in the quatrains and save as JSON.

    The resulting ``segment -> count`` mapping is written to ``_wc_path``.
    """
    counters = dict()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            for seg in segmenter.segment(sentence):
                counters[seg] = counters.get(seg, 0) + 1
        processed = idx + 1
        if processed % 10000 == 0:
            print("[Word Count] %d/%d quatrains has been processed." %(processed, len(quatrains)))
    with codecs.open(_wc_path, 'w', 'utf-8') as fout:
        json.dump(counters, fout)
def gen_train_data():
    """Write the planner and generator training files.

    Only quatrains (four sentences) whose sentences all have seven
    characters are used. For each sentence the keyword is the word with the
    smallest ``ranked_words.get_rank`` value among the words known to
    ``ranked_words``; a poem is discarded when any sentence has no such word.

    Outputs:
        ``plan_data_path``: one line per poem, its four keywords joined by
        tabs.
        ``gen_data_path``: one ``sentence<EOS><TAB>keyword<TAB>context`` line
        per sentence, context being the poem text before that sentence.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Only consider quatrains.
        if len(poem) != 4:
            continue
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            # Only seven-character sentences are accepted.
            if len(sentence) != 7:
                break
            # Ignore every word that is not in the ranked word list.
            words = [seg for seg in segmenter.segment(sentence)
                     if seg in ranked_words]
            if not words:
                break
            # Pick the word with the best (smallest) rank in this sentence.
            keyword = words[0]
            for word in words[1:]:
                if ranked_words.get_rank(word) < ranked_words.get_rank(keyword):
                    keyword = word
            closed = sentence + end_of_sentence()
            gen_lines.append(closed + '\t' + keyword + '\t' + context + '\n')
            keywords.append(keyword)
            context += closed
        else:
            # Reached only when no sentence was rejected.
            # Plan data: each line is the four keywords of one poem.
            plan_data.append('\t'.join(keywords) + '\n')
            gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        for line in plan_data:
            fout.write(line)
    with open(gen_data_path, 'w') as fout:
        for line in gen_data:
            fout.write(line)
def _train(self):
    """Train the planner's Word2Vec model and save it to ``_model_path``.

    Each quatrain becomes one training "sentence": the concatenation of the
    segments of all its lines that are present in ``self.ranks``.
    """
    print("Start training Word2Vec for planner ...")
    quatrains = get_quatrains()
    segmenter = Segmenter()
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        kept = []
        for sentence in quatrain['sentences']:
            for seg in segmenter.segment(sentence):
                if seg in self.ranks:
                    kept.append(seg)
        seg_lists.append(kept)
        processed = idx + 1
        if processed % 10000 == 0:
            print("[Plan Word2Vec] %d/%d quatrains has been processed." %(processed, len(quatrains)))
    print("Hold on. This may take some time ...")
    self.model = models.Word2Vec(seg_lists, size = 512, min_count = 5)
    self.model.save(_model_path)
def setImage(self):
    """Let the user pick an image; show it and unlock the threshold controls.

    Nothing happens when the dialog is cancelled. Otherwise the image is
    scaled into ``imageLabel`` (aspect ratio kept), the threshold and
    auto-segment widgets are enabled, the select button is disabled, and a
    ``Segmenter`` for the chosen file is stored on ``self.segmenter``.
    """
    chosen, _selected_filter = QtWidgets.QFileDialog.getOpenFileName(
        None, "Select Image", "", "Image Files (*.png *.jpg *.jpeg *.bmp)")
    if not chosen:
        return
    label = self.imageLabel
    scaled = QtGui.QPixmap(chosen).scaled(
        label.width(), label.height(), QtCore.Qt.KeepAspectRatio)
    label.setPixmap(scaled)
    label.setAlignment(QtCore.Qt.AlignCenter)
    for widget in (self.thresholdInc, self.thresholdDec,
                   self.thresholdVal, self.autoSegmentBtn):
        widget.setEnabled(True)
    self.selectImageBtn.setEnabled(False)
    self.filePath = chosen
    self.segmenter = Segmenter(chosen)
def get_pop_quatrains(num = 100000):
    """Return up to ``num`` quatrains, ordered by descending ``_min_word_cnt``.

    ``_min_word_cnt`` is evaluated once per quatrain; quatrains with equal
    values keep their original relative order.
    """
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    min_word_cnts = [_min_word_cnt(cnts, quatrain, segmenter)
                     for quatrain in quatrains]
    order = list(range(len(quatrains)))
    order.sort(key = lambda i: -min_word_cnts[i])
    selected = order[:min(num, len(order))]
    return [quatrains[index] for index in selected]
def gen_train_data():
    """Generate the plan (keyword) and generation training files.

    Only poems of four seven-character sentences are processed. The keyword
    of a sentence is the first word with the smallest
    ``ranked_words.get_rank`` value among its ranked words; a poem with a
    sentence that has no ranked word is skipped.

    Outputs:
        ``plan_data_path``: tab-joined keywords, one poem per line.
        ``gen_data_path``: ``sentence<EOS><TAB>keyword<TAB>context`` lines.
    """
    print("Generating training data ...")
    segmenter = Segmenter()
    poems = Poems()
    poems.shuffle()
    ranked_words = RankedWords()
    plan_data = []
    gen_data = []
    for poem in poems:
        # Only four-line, seven-character poems are handled.
        if len(poem) != 4:
            continue
        accepted = True
        context = start_of_sentence()
        gen_lines = []
        keywords = []
        for sentence in poem:
            if len(sentence) != 7:
                accepted = False
                break
            words = [seg for seg in segmenter.segment(sentence)
                     if seg in ranked_words]
            if not words:
                accepted = False
                break
            # ``min`` returns the first minimal item, matching a strict
            # "<" scan from the front.
            keyword = min(words, key=ranked_words.get_rank)
            gen_lines.append(sentence + end_of_sentence() +
                             '\t' + keyword + '\t' + context + '\n')
            keywords.append(keyword)
            context += sentence + end_of_sentence()
        if not accepted:
            continue
        plan_data.append('\t'.join(keywords) + '\n')
        gen_data.extend(gen_lines)
    with open(plan_data_path, 'w') as fout:
        fout.writelines(plan_data)
    with open(gen_data_path, 'w') as fout:
        fout.writelines(gen_data)
def _get_adjlists(self):
    """Count co-occurrences of segmented words, without normalisation.

    For every item of every poem set, each pair of word positions adds 1.0
    to the edge weight in both directions.

    Returns:
        ``collections.defaultdict(dict)`` mapping word -> neighbour -> count.
    """
    poems = Poems()
    segmenter = Segmenter()
    adjlists = collections.defaultdict(dict)
    for poem_set in poems:
        for poem in poem_set:
            words = segmenter.segment(poem)
            for i, left in enumerate(words):
                for right in words[i + 1:]:
                    forward = adjlists[left]
                    forward[right] = forward.get(right, 0.0) + 1.0
                    backward = adjlists[right]
                    backward[left] = backward.get(left, 0.0) + 1.0
    return adjlists
def _train(self):
    """Train the planner's Word2Vec model and save it to ``_model_path``.

    Each quatrain contributes one list made of the segments of all its
    sentences that are present in ``self.ranks``.
    """
    print("Start training Word2Vec for planner ...")
    quatrains = get_quatrains()
    # Segmenting a sentence is not the same as taking each character.
    segmenter = Segmenter()
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        seg_list = []
        for sentence in quatrain['sentences']:
            seg_list.extend([seg for seg in segmenter.segment(sentence)
                             if seg in self.ranks])
        seg_lists.append(seg_list)
        processed = idx + 1
        if processed % 10000 == 0:
            print("[Plan Word2Vec] %d/%d quatrains has been processed." % (
                processed, len(quatrains)))
    print("Hold on. This may take some time ...")
    # The trained object is the word-vector model itself.
    self.model = models.Word2Vec(seg_lists, size=512, min_count=5)
    self.model.save(_model_path)
def _build_adjlists_from_tencent_embeddings(self):
    """Build a fully connected word graph weighted by embedding similarity.

    All non-stopword segments of all poems form the vocabulary. Every
    unordered pair of distinct words gets ``wv.similarity`` as its weight in
    both directions (computed once per pair), after which each word's
    outgoing weights are divided by their sum.

    Returns:
        dict mapping word -> dict mapping other word -> normalised weight.
    """
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    # 2D dict: adjlists[word1][word2] is the weight of going word1 -> word2.
    adjlists = dict()
    wv = get_tencent_embedding_keyedVectors(_tencent_embedding_path)
    # Collect the vocabulary, keeping only non-stopword segments.
    words = set()
    for poem in poems:
        for sentence in poem:
            for word in segmenter.segment(sentence):
                if word not in self.stopwords:
                    words.add(word)
    ordered = list(words)
    for word in ordered:
        adjlists[word] = dict()
    # Visit each unordered pair exactly once, in vocabulary order.
    for i, word in enumerate(ordered):
        for other in ordered[i + 1:]:
            sim = wv.similarity(word, other)
            adjlists[word][other] = sim
            adjlists[other][word] = sim
    # Normalize weights.
    for neighbours in adjlists.values():
        sum_w = sum(neighbours.values())
        for other in neighbours:
            neighbours[other] /= sum_w
    return adjlists
def _gen_train_data():
    """Write the keyword-annotated training files from the popular quatrains.

    For each four-sentence poem, the keyword of a sentence is the segment
    with the smallest value in ``ranks`` (the later one on equal values).

    Outputs:
        ``train_path``: each line is a sentence and its keyword, separated
        by a tab.
        ``kw_train_path``: each line holds the keywords of one poem,
        separated by tabs.
    """
    segmenter = Segmenter()
    poems = get_pop_quatrains()
    random.shuffle(poems)
    ranks = get_word_ranks()
    print("Generating training data ...")
    data = []
    kw_data = []
    for idx, poem in enumerate(poems):
        sentences = poem['sentences']
        if len(sentences) == 4:
            rows = []
            kw_row = []
            usable = True
            for sentence in sentences:
                segs = [seg for seg in segmenter.segment(sentence)
                        if seg in ranks]
                # A sentence without any ranked segment makes the whole
                # poem unusable.
                if not segs:
                    usable = False
                    break
                keyword = segs[0]
                for cand in segs[1:]:
                    if not ranks[keyword] < ranks[cand]:
                        keyword = cand
                kw_row.append(keyword)
                # Each row is the sentence followed by its keyword.
                rows.append([sentence, keyword])
            if usable:
                data.extend(rows)       # one entry per sentence
                kw_data.append(kw_row)  # one entry per poem
        processed = idx + 1
        if processed % 2000 == 0:
            print("[Training Data] %d/%d poems are processed." % (processed,
                                                                  len(poems)))
    with codecs.open(train_path, 'w', 'utf-8') as fout:
        for row in data:
            fout.write('\t'.join(row) + '\n')
    with codecs.open(kw_train_path, 'w', 'utf-8') as fout:
        for kw_row in kw_data:
            fout.write('\t'.join(kw_row) + '\n')
    print("Training data is generated.")
def _do_text_rank(self):
    """Run a fixed number of TextRank iterations and store the ranked words.

    Scores start at 1.0 and are updated synchronously: all new scores of a
    pass are computed from the previous pass. The loop performs passes
    0 .. ``NUM_Of_ITERATIONS`` inclusive. Words are then sorted with words
    found in ``segmenter.sxhy_dict`` first and descending score as the
    tie-breaker, and written as JSON to ``wordrank_path``.
    """
    print("Do text ranking ...")
    adjlists = self._get_adjlists()
    print("[TextRank] Total words: %d" % len(adjlists))
    # Value initialization: score of the previous pass and of the new pass.
    previous = {word: 1.0 for word in adjlists}
    current = dict(previous)
    # Synchronous value iterations.
    itr = 0
    while True:
        sys.stdout.write("[TextRank] Iteration %d ..." % itr)
        sys.stdout.flush()
        for word, adjlist in adjlists.items():
            incoming = sum(adjlists[other][word] * previous[other]
                           for other in adjlist)
            current[word] = (1.0 - _damp) + _damp * incoming
        # eps is the largest change between two passes; reported only.
        eps = 0
        for word in previous:
            eps = max(eps, abs(previous[word] - current[word]))
            previous[word] = current[word]
        print(" eps = %f" % eps)
        if itr == NUM_Of_ITERATIONS:
            break
        itr += 1
    # Dictionary-based comparison with TextRank score as a tie-breaker.
    segmenter = Segmenter()

    def cmp_key(x):
        word, score = x
        return (0 if word in segmenter.sxhy_dict else 1, -score)

    words = sorted(previous.items(), key=cmp_key)
    # Store ranked words and scores.
    with open(wordrank_path, 'w') as fout:
        json.dump(words, fout)
def _do_text_rank(self):
    """Iterate TextRank until convergence and store the ranked words.

    Scores start at 1.0 and are updated synchronously. Iteration stops once
    the largest score change of a pass is at most 1e-6. Words are sorted
    with words found in ``segmenter.sxhy_dict`` first and descending score
    as the tie-breaker, then written as JSON to ``wordrank_path``.
    """
    print("Do text ranking ...")
    adjlists = self._get_adjlists()
    print("[TextRank] Total words: %d" % len(adjlists))
    # Value initialization.
    old_scores = {word: 1.0 for word in adjlists}
    new_scores = dict(old_scores)
    # Synchronous value iterations.
    itr = 0
    converged = False
    while not converged:
        sys.stdout.write("[TextRank] Iteration %d ..." % itr)
        sys.stdout.flush()
        for word, adjlist in adjlists.items():
            incoming = sum(adjlists[other][word] * old_scores[other]
                           for other in adjlist)
            new_scores[word] = (1.0 - _damp) + _damp * incoming
        eps = 0
        for word in old_scores:
            eps = max(eps, abs(old_scores[word] - new_scores[word]))
            old_scores[word] = new_scores[word]
        print(" eps = %f" % eps)
        converged = eps <= 1e-6
        if not converged:
            itr += 1
    # Dictionary-based comparison with TextRank score as a tie-breaker.
    segmenter = Segmenter()

    def cmp_key(x):
        word, score = x
        return (0 if word in segmenter.sxhy_dict else 1, -score)

    words = sorted(old_scores.items(), key=cmp_key)
    # Store ranked words and scores.
    with open(wordrank_path, 'w') as fout:
        json.dump(words, fout)
def __init__(self, punct_file=None, stop_file=None, once_file=None,
             reserve_file=None, area_file=None, color_file=None,
             quantifier_file=None, num_file=None):
    """Load the word lists and helpers used for feature extraction.

    Args:
        punct_file: punctuation characters; defaults to ``dict/punct.txt``
            next to this module.
        stop_file: stop words; defaults to ``dict/stop_words.txt``.
        once_file: words to remove; defaults to ``dict/once.words``.
        reserve_file: words that are always kept; defaults to
            ``dict/reserve_words.txt``.
        area_file, color_file, quantifier_file, num_file: forwarded to
            ``WordLabel``.
    """
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    if not punct_file:
        punct_file = cur_dir + '/dict/punct.txt'
    if not stop_file:
        stop_file = cur_dir + '/dict/stop_words.txt'
    if not once_file:
        once_file = cur_dir + '/dict/once.words'
    if not reserve_file:
        reserve_file = cur_dir + '/dict/reserve_words.txt'
    self.segmenter = Segmenter()
    self.punct = set()
    # Call the loader; assigning to ``self.load_punct`` replaced the method
    # with the file name and left ``self.punct`` empty.
    self.load_punct(punct_file)
    self.stop_words = set()
    self.load_stop_words(stop_file)
    self.remove_words = set()
    self.load_remove_words(once_file)
    self.reserve_words = set()
    self.load_reserve_words(reserve_file)
    # (old, new) text replacements.
    # NOTE(review): as written, (u',', u',') replaces a character by itself
    # and (u'[', u'')/(u']', u'') look like ASCII brackets -- possibly
    # full-width characters were intended; confirm against the data.
    self.replace_lst = [(u'斜跨包', u'斜挎包'), (u'!', u','), (u'。', u','),
                        (u',', u','), (u'市场价', u''), (u'全国包邮', u''),
                        (u'包邮', u''), (u'【', u''), (u'】', u''),
                        (u'[', u''), (u']', u''), (u'《', u''), (u'》', u'')]
    # Forward every label file; these arguments used to be accepted but
    # replaced by None.
    self.word_label = WordLabel(area_file=area_file, color_file=color_file,
                                quantifier_file=quantifier_file,
                                num_file=num_file)
def _do_text_rank(self):
    """Score every word with TextRank, store the ranking and return scores.

    Steps:
        1. Give each word of the co-occurrence graph a pair
           ``[previous_score, new_score]`` initialised to 1.0.
        2. Normalise each word's outgoing edge weights to sum to 1.
        3. Update scores synchronously until the largest absolute change of
           a pass is below 0.05.
        4. Sort words with those found in ``segment.sxhy_dict`` first and
           descending score as tie-breaker; write them as JSON to
           ``wordrank_path``.

    Returns:
        dict mapping word -> ``[score, score]`` after the final pass.
    """
    print("Do text ranking ...")
    segment = Segmenter()
    adjlists = self._get_adjlists()
    scores = {word: [1.0, 1.0] for word in adjlists}
    # Normalise edge weights. Words without neighbours have an empty dict,
    # so the division is never reached with a zero sum.
    for adjust in adjlists.values():
        sums = sum(adjust.values())
        for other in adjust:
            adjust[other] /= sums
    _damp = 0.85
    while True:
        for word, adjust in adjlists.items():
            # A word's score comes from its neighbours' previous scores,
            # weighted by the neighbour -> word edge.
            scores[word][1] = (1 - _damp) + _damp * sum(
                scores[other][0] * adjlists[other][word] for other in adjust)
        eps = 0.0
        for pair in scores.values():
            # Absolute change: scores may move in either direction.
            eps = max(eps, abs(pair[0] - pair[1]))
            pair[0] = pair[1]
        print('eps=>', eps)
        if eps < 0.05:
            break

    def tmp_key(x):
        word, score = x
        # 0 sorts before 1, so dictionary words come first.
        return 0 if word in segment.sxhy_dict else 1, -score

    word_and_scores = sorted([(word, score[0])
                              for word, score in scores.items()],
                             key=tmp_key)
    with open(wordrank_path, 'w') as fw:
        json.dump(word_and_scores, fw)
    return scores
def generate_segmentation(self):
    """Create a ``Segmenter`` from ``self.line_info`` and keep it on
    ``self.segmentation``."""
    line_info = self.line_info
    self.segmentation = Segmenter(line_info)
class WordFeature(object):
    """Turn product text into a bag-of-words feature dict.

    The instance holds several word lists (punctuation, stop words, words to
    remove, reserved words) that decide which segmented words are kept.
    """

    def __init__(self, punct_file=None, stop_file=None, once_file=None,
                 reserve_file=None, area_file=None, color_file=None,
                 quantifier_file=None, num_file=None):
        """Load the word lists and helpers used for feature extraction.

        Args:
            punct_file: punctuation characters; defaults to
                ``dict/punct.txt`` next to this module.
            stop_file: stop words; defaults to ``dict/stop_words.txt``.
            once_file: words to remove; defaults to ``dict/once.words``.
            reserve_file: words that are always kept; defaults to
                ``dict/reserve_words.txt``.
            area_file, color_file, quantifier_file, num_file: forwarded to
                ``WordLabel``.
        """
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        if not punct_file:
            punct_file = cur_dir + '/dict/punct.txt'
        if not stop_file:
            stop_file = cur_dir + '/dict/stop_words.txt'
        if not once_file:
            once_file = cur_dir + '/dict/once.words'
        if not reserve_file:
            reserve_file = cur_dir + '/dict/reserve_words.txt'
        self.segmenter = Segmenter()
        self.punct = set()
        # Call the loader; assigning to ``self.load_punct`` replaced the
        # method with the file name and left ``self.punct`` empty.
        self.load_punct(punct_file)
        self.stop_words = set()
        self.load_stop_words(stop_file)
        self.remove_words = set()
        self.load_remove_words(once_file)
        self.reserve_words = set()
        self.load_reserve_words(reserve_file)
        # (old, new) text replacements.
        # NOTE(review): as written, (u',', u',') replaces a character by
        # itself -- possibly a full-width comma was intended; confirm.
        self.replace_lst = [(u'斜跨包', u'斜挎包'), (u'!', u','),
                            (u'。', u','), (u',', u','), (u'市场价', u''),
                            (u'全国包邮', u''), (u'包邮', u''), (u'【', u''),
                            (u'】', u''), (u'[', u''), (u']', u''),
                            (u'《', u''), (u'》', u'')]
        # Forward every label file; these arguments used to be accepted but
        # replaced by None.
        self.word_label = WordLabel(area_file=area_file,
                                    color_file=color_file,
                                    quantifier_file=quantifier_file,
                                    num_file=num_file)

    @staticmethod
    def _read_lines(filename):
        """Return the UTF-8 decoded lines of ``filename`` without newlines."""
        import io  # local import: works on Python 2 and 3, yields unicode
        with io.open(filename, 'r', encoding='utf-8') as f:
            return [line.rstrip('\n') for line in f]

    def _add_char_to_set(self, myset, filename):
        """Add every character of every line of ``filename`` to ``myset``."""
        for line in self._read_lines(filename):
            myset.update(line)

    def load_punct(self, filename):
        """Load punctuation characters into ``self.punct``."""
        self._add_char_to_set(self.punct, filename)

    def load_stop_words(self, filename):
        """Load one stop word per line into ``self.stop_words``."""
        self.stop_words.update(self._read_lines(filename))

    def load_remove_words(self, filename):
        """Load one word per line into ``self.remove_words``."""
        self.remove_words.update(self._read_lines(filename))

    def load_reserve_words(self, filename):
        """Load one lower-cased word per line into ``self.reserve_words``."""
        self.reserve_words.update(
            line.lower() for line in self._read_lines(filename))

    def check_is_mode(self, word):
        """Return True for model-number-like words such as ``abc-123``.

        A word qualifies when it contains at least one hyphen and otherwise
        consists only of ``a``-``z`` and ``0``-``9``. The previous character
        test (``c < 'a' and c > 'z'``) could never be true, so every word
        containing a hyphen was reported as a model number.
        """
        has_hyphen = False
        for c in word:
            if c == u'-':
                has_hyphen = True
            elif not (u'a' <= c <= u'z' or u'0' <= c <= u'9'):
                return False
        return has_hyphen

    def check_valid_new(self, word):
        """Return True when ``word`` should be kept as a feature.

        Reserved words are always kept. Otherwise the word is rejected when
        it is empty, numeric, a short (<= 3) alphanumeric token, a single
        character, punctuation, a stop word, a word to remove, a model
        number, or parseable as a float.
        """
        if word in self.reserve_words:
            return True
        if not word:
            return False
        if word.isnumeric():
            return False
        # isalnum() on unicode also accepts CJK characters, so test the
        # encoded bytes instead.
        if word.encode("u8").isalnum() and len(word) <= 3:
            return False
        if len(word) == 1:
            return False
        if word in self.punct:
            return False
        if word in self.stop_words:
            return False
        if word in self.remove_words:
            return False
        if self.check_is_mode(word):
            return False
        try:
            float(word)
        except ValueError:
            return True
        return False

    def check_valid(self, word):
        """Return True when ``word`` should be kept as a feature.

        Rejects empty and numeric words, punctuation, single characters with
        a code point below 256, words starting with a digit, stop words,
        words to remove and model numbers.
        """
        if not word:
            return False
        if word.isnumeric():
            return False
        if word in self.punct:
            return False
        if len(word) == 1 and ord(word) < 256:
            return False
        if word[0].isdigit():
            return False
        if word in self.stop_words:
            return False
        if word in self.remove_words:
            return False
        if self.check_is_mode(word):
            return False
        return True

    def convert_word_features(self, text):
        """Segment ``text`` and return ``{label: 1}`` for every valid word.

        Each word is stripped of parentheses, mapped through
        ``self.word_label.word_label`` (which also receives the previous
        label) and kept only when ``check_valid`` accepts it.
        """
        words = self.segmenter.segment(text.lower().strip())
        features = {}
        word0 = ""
        for word in words:
            # NOTE(review): the two pairs of replacements are identical as
            # written -- possibly one pair was full-width; confirm.
            word = word.strip().replace(u'(', u'').replace(u')', u'') \
                .replace(u'(', u'').replace(u')', u'')
            if not word:
                continue
            word = self.word_label.word_label(word, word0)
            word0 = word
            if not self.check_valid(word):
                continue
            features[word] = 1
        return features

    def convert_all(self, cid, name, cat, brand, price):
        """Normalise one record and attach its feature sample.

        The first ``category_remove`` entries (client config, default 0) are
        dropped from the JSON list ``cat``; an unparsable ``cat`` becomes
        ``u'[]'``. Brands ending in ``u'公司'`` are blanked.

        Returns:
            Tuple ``(cid, name, cat, brand, price, sample)``.
        """
        remove_cat_count = 0
        try:
            config = zk_conf.get_client(cid)
            if config and "category_remove" in config:
                remove_cat_count = config["category_remove"]
        except Exception as e:
            # Best effort: fall back to removing nothing.
            logging.error("category_remove: %s", e)
        try:
            cat = json.dumps(json.loads(cat)[remove_cat_count:],
                             separators=(',', ':'), ensure_ascii=False)
        except Exception:
            cat = u'[]'
        if brand.endswith(u'公司'):
            brand = u''
        name = self.extract_sentence(name)
        sample = self.convert_features_with_all(name, cat, brand, price)
        return (cid, name, cat, brand, price, sample)
def test():
    """Smoke-run the segmenter on ``test.png``: threshold/morph with value
    11, then auto-segment."""
    segmenter = Segmenter('test.png')
    segmenter.threshold_and_morph(11)
    segmenter.auto_segment()
# -*- coding: utf-8 -*-
from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    """Parse one ``word freq`` line into ``(word, log(freq + 1))``."""
    word, freq = line.strip().split(" ")
    return (word, log(float(freq) + 1.0))


# Build the word -> weight table from the dictionary file.
with open("dict.txt", "r", "utf-8") as fin:
    tf = dict(imap(wrap, fin))

lex = Lexicon(tf)
seg = Segmenter(lex)
result = seg.segment(u"這是一隻可愛的小花貓")
# Print the segments joined by slashes, encoded as UTF-8.
print("/".join(result).encode("utf-8"))
class Ui_MainWindow(object):
    """Main window for picking an image, tuning a threshold and segmenting."""

    def setupUi(self, MainWindow):
        """Create and lay out all widgets and connect the button signals."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(570, 351)
        central = QtWidgets.QWidget(MainWindow)
        self.centralwidget = central
        central.setObjectName("centralwidget")

        def add_button(name, x, y, width, height):
            # Create a push button on the central widget at a fixed place.
            button = QtWidgets.QPushButton(central)
            button.setGeometry(QtCore.QRect(x, y, width, height))
            button.setObjectName(name)
            return button

        self.selectImageBtn = add_button("selectImageBtn", 30, 300, 93, 28)

        self.imageLabel = QtWidgets.QLabel(central)
        self.imageLabel.setGeometry(QtCore.QRect(20, 10, 531, 261))
        self.imageLabel.setFrameShape(QtWidgets.QFrame.Box)
        self.imageLabel.setText("")
        self.imageLabel.setObjectName("imageLabel")

        self.submitBtn = add_button("submitBtn", 460, 300, 93, 28)
        self.thresholdDec = add_button("thresholdDec", 150, 300, 31, 28)
        self.thresholdInc = add_button("thresholdInc", 250, 300, 31, 28)

        # Read-only field showing the current threshold value.
        self.thresholdVal = QtWidgets.QLineEdit(central)
        self.thresholdVal.setGeometry(QtCore.QRect(190, 300, 51, 31))
        self.thresholdVal.setObjectName("thresholdVal")
        self.thresholdVal.isReadOnly()  # result is not used
        self.thresholdVal.setAlignment(QtCore.Qt.AlignCenter)
        self.thresholdVal.setReadOnly(True)
        self.thresholdVal.setText("5")

        self.autoSegmentBtn = add_button("autoSegmentBtn", 320, 300, 111, 28)

        MainWindow.setCentralWidget(central)
        self.filePath = ""

        # Everything except the select button starts disabled.
        for widget in (self.submitBtn, self.thresholdDec, self.thresholdInc,
                       self.thresholdVal, self.autoSegmentBtn):
            widget.setEnabled(False)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

        self.selectImageBtn.clicked.connect(self.setImage)
        self.thresholdInc.clicked.connect(self.increaseThreshold)
        self.thresholdDec.clicked.connect(self.decreaseThreshold)
        self.autoSegmentBtn.clicked.connect(self.autoSegment)
        # The submit button is not connected to uploadImage.

    def retranslateUi(self, MainWindow):
        """Set the window title and the button captions."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        captions = (
            (self.selectImageBtn, "Select Image"),
            (self.submitBtn, "Submit"),
            (self.thresholdDec, "▼"),
            (self.thresholdInc, "▲"),
            (self.autoSegmentBtn, "Auto-Segment"),
        )
        for button, caption in captions:
            button.setText(_translate("MainWindow", caption))

    def setImage(self):
        """Let the user pick an image; show it and unlock the controls.

        Nothing happens when the dialog is cancelled. Otherwise the select
        button is disabled and a ``Segmenter`` for the chosen file is stored
        on ``self.segmenter``.
        """
        chosen, _selected_filter = QtWidgets.QFileDialog.getOpenFileName(
            None, "Select Image", "", "Image Files (*.png *.jpg *.jpeg *.bmp)")
        if not chosen:
            return
        label = self.imageLabel
        scaled = QtGui.QPixmap(chosen).scaled(
            label.width(), label.height(), QtCore.Qt.KeepAspectRatio)
        label.setPixmap(scaled)
        label.setAlignment(QtCore.Qt.AlignCenter)

        for widget in (self.thresholdInc, self.thresholdDec,
                       self.thresholdVal, self.autoSegmentBtn):
            widget.setEnabled(True)
        self.selectImageBtn.setEnabled(False)

        self.filePath = chosen
        self.segmenter = Segmenter(chosen)

    def uploadImage(self):
        """Placeholder; does nothing."""
        pass

    def increaseThreshold(self):
        """Raise the threshold by one and re-run threshold_and_morph."""
        raised = int(self.thresholdVal.text()) + 1
        self.thresholdVal.setText(str(raised))
        self.segmenter.threshold_and_morph(raised)

    def decreaseThreshold(self):
        """Lower the threshold by one (not when it is 1) and re-run
        threshold_and_morph."""
        current = int(self.thresholdVal.text())
        if current == 1:
            return
        lowered = current - 1
        self.thresholdVal.setText(str(lowered))
        self.segmenter.threshold_and_morph(lowered)

    def autoSegment(self):
        """Auto-segment with the displayed threshold, then confirm."""
        threshold = int(self.thresholdVal.text())
        self.segmenter.auto_segment(threshold)
        self.showdialog()

    def showdialog(self):
        """Show a "Done / Success!" message box."""
        msg = QtWidgets.QMessageBox()
        msg.about(self.centralwidget, "Done", "Success!")
# -*- coding: utf-8 -*-
from codecs import open
from itertools import imap
from math import log

from lexicon import Lexicon
from segment import Segmenter


def wrap(line):
    """Turn a ``word freq`` line into the pair ``(word, log(freq + 1))``."""
    fields = line.strip().split(' ')
    w, f = fields
    weight = log(float(f) + 1.0)
    return (w, weight)


# Read the dictionary file into a word -> weight mapping.
with open('dict.txt', 'r', 'utf-8') as fin:
    tf = dict(imap(wrap, fin))

lex = Lexicon(tf)
seg = Segmenter(lex)
result = seg.segment(u'這是一隻可愛的小花貓')
joined = '/'.join(result)
print(joined.encode('utf-8'))
def _get_adjlists(self):
    """Build the normalised word co-occurrence graph used by TextRank.

    Co-occurrence is counted per sentence: each sentence is segmented,
    words found in ``self.stopwords`` are dropped, and every pair of
    remaining word positions in that sentence adds 1.0 to the edge weight in
    both directions. Each word's outgoing weights are then divided by their
    sum.

    Returns:
        2D dict where ``adjlists[word1][word2]`` is the normalised weight of
        going from word1 to word2. Words that never co-occur with anything
        map to an empty dict.
    """
    print("[TextRank] Generating word graph ...")
    segmenter = Segmenter()
    poems = Poems()
    adjlists = dict()
    # Count number of co-occurrence, one sentence at a time.
    for poem in poems:
        for sentence in poem:
            # Keep only the non-stopword segments of this sentence.
            words = [word for word in segmenter.segment(sentence)
                     if word not in self.stopwords]
            # Make sure every word has a neighbour dict.
            for word in words:
                adjlists.setdefault(word, dict())
            # Two words in the same sentence: their edge weight += 1.
            for i, left in enumerate(words):
                for right in words[i + 1:]:
                    adjlists[left][right] = adjlists[left].get(right, 0.0) + 1.0
                    adjlists[right][left] = adjlists[right].get(left, 0.0) + 1.0
    # Normalize weights.
    for a in adjlists:
        sum_w = sum(adjlists[a].values())
        for b in adjlists[a]:
            adjlists[a][b] /= sum_w
    return adjlists
def main(page_array, conf=None, retries=0, text=False, page_info=None):
    '''Main procedure for processing a page from start to finish

    Parameters:
    --------------------
    page_array: a 2 dimensional numpy array containing binary pixel data of
        the image

    conf: Config, optional
        Run configuration. When omitted, a fresh
        ``Config(viterbi_postprocess=False, line_break_method=None,
        page_type=None)`` is created for this call. Note that the detected
        ``page_type`` and ``line_break_method`` are written back into
        ``conf.conf``.

    retries: Used internally when system attempts to reboot a failed attempt

    text: boolean flag. If true, return text rather than char-position data

    page_info: dictionary, optional
        A dictionary containing metadata about the page to be recognized.
        Define strings for the keywords "flname" and "volume" if saving
        a serialized copy of the OCR results.

    Returns:
    --------------
    The recognized text of the entire page when ``text`` is true, otherwise
    the recognition results as produced by the recognizer. When
    ``conf['viterbi_postprocess']`` is set, ``(prob, results)`` is returned
    instead. An empty list is returned when the retry fails as well.
    '''
    # Build the defaults per call: a default created in the signature would
    # be shared between calls and keep the page type detected for the first
    # page.
    if conf is None:
        conf = Config(viterbi_postprocess=False, line_break_method=None,
                      page_type=None)
    if page_info is None:
        page_info = {}
    flname = page_info.get('flname', '')
    print(flname)

    confpath = conf.path
    conf = conf.conf

    line_break_method = conf['line_break_method']
    page_type = conf['page_type']

    ### Set the line_break method automatically if it hasn't been
    ### specified beforehand
    if not line_break_method and not page_type:
        if page_array.shape[1] > 2*page_array.shape[0]:
            print('setting page type as pecha')
            line_break_method = 'line_cluster'
            page_type = 'pecha'
        else:
            print('setting page type as book')
            line_break_method = 'line_cut'
            page_type = 'book'

    conf['page_type'] = page_type
    conf['line_break_method'] = line_break_method
    detect_o = conf.get('detect_o', False)
    print('clear hr %s' % (conf.get('clear_hr', False),))

    results = []
    try:
        ### Get information about the pages
        shapes = PE2(page_array, cls, page_type=page_type,
                     low_ink=conf['low_ink'],
                     flpath=flname,
                     detect_o=detect_o,
                     clear_hr=conf.get('clear_hr', False))
        shapes.conf = conf

        ### Separate the lines on a page
        # NOTE(review): k_groups is only defined for pecha pages; using
        # line_cluster with another page type raises NameError, which the
        # handler below turns into a failed run -- confirm this is intended.
        if page_type == 'pecha':
            k_groups = shapes.num_lines
        shapes.viterbi_post = conf['viterbi_postprocess']

        if line_break_method == 'line_cut':
            line_info = LineCut(shapes)
            if not line_info:
                # Immediately skip to re-run with LineCluster: the
                # SystemExit raised here is caught by the bare except below.
                sys.exit()
        elif line_break_method == 'line_cluster':
            line_info = LineCluster(shapes, k=k_groups)

        ### Perform segmentation of characters
        segmentation = Segmenter(line_info)

        ### Perform recognition
        if not conf['viterbi_postprocess']:
            if conf['recognizer'] == 'probout':
                results = recognize_chars_probout(segmentation)
            elif conf['recognizer'] == 'hmm':
                results = recognize_chars_hmm(segmentation, trans_p, start_p)
            elif conf['recognizer'] == 'kama':
                results = recognize_chars_probout(segmentation)
                results = recognize_chars_kama(results, segmentation)
            if conf['postprocess']:
                results = viterbi_post_process(
                    segmentation.line_info.shapes.img_arr, results)
        else:
            # Should only be call from *within* a non viterbi run...
            prob, results = hmm_recognize_bigram(segmentation)
            return prob, results

        ### Construct an output string
        output = []
        for line in results:
            for k in line:
                # The last item of each entry is its recognized label.
                output.append(k[-1])
            output.append(u'\n')
        out = ''.join(output)
        print(out)

        if text:
            results = out
        return results
    except:
        ### Retry and assume the error was cause by use of the
        ### wrong line_break_method...
        # The bare except is deliberate: it must also catch the SystemExit
        # raised above.
        import traceback
        traceback.print_exc()
        if not results and not conf['viterbi_postprocess']:
            print('WARNING ' + '*'*40)
            print('%s failed to return a result.' % flname)
            print('WARNING ' + '*'*40)
            print('')
            if line_break_method == 'line_cut' and retries < 1:
                print('retrying with line_cluster instead of line_cut')
                try:
                    return main(page_array,
                                conf=Config(path=confpath,
                                            line_break_method='line_cluster',
                                            page_type='pecha'),
                                page_info=page_info, retries=1, text=text)
                except:
                    logging.info('Exited after failure of second run.')
                    return []
        if not conf['viterbi_postprocess']:
            if not results:
                logging.info('***** No OCR output for %s *****', flname)
            return results