def pro_triples_data(self, batch_size):
    """Stream batches of sentence triples read from ``self.fname``.

    Every non-blank line is stripped and split with ``seg_sentence`` (using
    ``self.line_min_words``); lines that produce no segments are skipped.
    The segments are turned into three parallel lists by
    ``self.make_lines_triples`` and accumulated in ``self.prev_sent``,
    ``self.curr_sent`` and ``self.next_sent``.  As soon as at least
    ``batch_size`` current sentences are buffered, everything produced by
    ``self.triples_data_iterator`` for the buffers is yielded and the
    buffers are reset.

    Sentences still buffered when the file ends (fewer than ``batch_size``)
    are not yielded; they stay in the three ``self.*_sent`` attributes.

    Args:
        batch_size: Minimum number of buffered current sentences before a
            flush; also forwarded to ``self.triples_data_iterator``.

    Yields:
        Whatever ``self.triples_data_iterator`` yields for each flush.
    """
    self.prev_sent, self.curr_sent, self.next_sent = [], [], []

    with open(self.fname) as source:
        for raw_line in source:
            text = raw_line.strip()
            if not text:
                continue

            sentences = seg_sentence(text, self.line_min_words)
            if not sentences:
                continue

            triples = self.make_lines_triples(sentences)
            self.prev_sent.extend(triples[0])
            self.curr_sent.extend(triples[1])
            self.next_sent.extend(triples[2])

            if len(self.curr_sent) >= batch_size:
                # TODO: Optimize batch-data initialization.
                batches = self.triples_data_iterator(
                    self.prev_sent,
                    self.curr_sent,
                    self.next_sent,
                    max_len=self.max_len,
                    batch_size=batch_size)
                for batch in batches:
                    yield batch

                # Start the next batch from empty buffers.
                self.prev_sent, self.curr_sent, self.next_sent = [], [], []
def _build_vocabulary_and_stats(self):
    """Build the vocabulary and collect corpus statistics from ``self.fname``.

    For every line of the file this method:

    * tokenizes the whole line with ``self._tok_line`` and, when the result
      is non-empty, adds the tokens to ``self.vocab`` and counts the line in
      ``self.total_lines``;
    * splits the line with ``seg_sentence``, tokenizes each segment, and
      raises ``self.max_len`` to the longest segment length plus 2 (room for
      the ``<go>`` and ``<eos>`` markers) when that exceeds the current
      value.

    After the file is read the vocabulary is truncated with
    ``self.vocab.cut_by_freq(self.max_vocab_size)``.

    ``self.vocab`` and ``self.max_len`` must already be initialised by the
    caller; they are updated in place.
    """
    with open(self.fname) as corpus:
        self.total_lines = 0
        for line in corpus:
            tokens = self._tok_line(line)

            # A line may yield no segments at all; calling max() on an empty
            # sequence would raise ValueError, so only update the maximum
            # when there is something to measure.
            segment_lengths = [
                len(self._tok_line(segment)) for segment in seg_sentence(line)
            ]
            if segment_lengths:
                longest = max(segment_lengths) + 2  # 2 = len([<go>, <eos>])
                if longest > self.max_len:
                    self.max_len = longest

            if not tokens:
                continue

            self.vocab.add_words(tokens)
            self.total_lines += 1

            # A falsy ``verbose`` (0 / None) would make the modulo raise;
            # treat it as "no progress logging".
            if self.verbose and self.total_lines % self.verbose == 0:
                self._logger.info('Read\t{0} lines.'.format(
                    self.total_lines))

    self.vocab.cut_by_freq(self.max_vocab_size)
    self._logger.info('Done building vocab and stats.')
def pro_tuple_data(self, out_file_name, batch_size=1):
    """Stream per-line batches of sentences read from ``out_file_name``.

    Nothing is yielded when ``out_file_name`` is empty/``None`` or does not
    exist.  Otherwise every non-blank line is stripped and split with
    ``seg_sentence`` (using ``self.line_min_words``); lines that produce no
    segments are skipped.  For each remaining line the non-empty segments
    are handed to ``self.lines_curr_iterator`` and everything it produces is
    yielded, followed by ``TextData.ONE_LINE_TOKEN`` to mark the end of that
    line.

    Args:
        out_file_name: Path of the text file to read.
        batch_size: Forwarded to ``self.lines_curr_iterator``.

    Yields:
        The items produced by ``self.lines_curr_iterator`` for one line,
        then ``TextData.ONE_LINE_TOKEN``; repeated for each usable line.
    """
    if not out_file_name:
        return
    if not os.path.exists(out_file_name):
        return

    with open(out_file_name) as source:
        for raw_line in source:
            text = raw_line.strip()
            if not text:
                continue

            segments = seg_sentence(text, self.line_min_words)
            if not segments:
                continue

            # Each line gets its own fresh list holding only the non-empty
            # segments.
            sentences = [segment for segment in segments if segment]
            batches = self.lines_curr_iterator(
                sentences, batch_size=batch_size)
            for batch in batches:
                yield batch

            # Tell the consumer that this input line is complete.
            yield TextData.ONE_LINE_TOKEN
def post(self):
    """Render the similar-case comparison page for a submitted text.

    Request arguments:
        message: The source text to compare (required).
        view_similar: Display mode, ``simi_line`` or ``simi_word``.  An
            empty or missing value is treated as ``simi_word``.

    The handler calls ``get_aysimi_demo(message, 10)`` and writes an HTML
    table with one row per result that has a ``"src"`` entry:

    * ``simi_line``: the submitted text and the result text are both split
      with ``seg_sentence``.  Each ``row_col`` entry of the result is read
      as ``(sentence index in the submitted text, sentence index in the
      result text)``.  Sentences of the first pair are shown in red, those
      of pairs 1-3 in blue, and later pairs get an olive label; unpaired,
      non-blank sentences are listed plainly.
    * ``simi_word``: the submitted text is shown as is, and in the result
      text runs of consecutive words that also occur in the submitted text
      (ignoring stopwords and words starting with digits) are shown in
      green.

    All text taken from the request or from the results is HTML-escaped
    before it is written, so it cannot inject markup into the page.
    """
    # Function-scope imports, as in the original implementation.
    from matrixText.matrix_seg import seg_sentence
    from tornado.escape import xhtml_escape

    self.set_header("Content-Type", "text/html")
    src = self.get_argument("message")
    # NOTE(review): the original compared this string against ``[]`` (never
    # true), which suggests an absent selection was meant to fall back to
    # the word view; the empty default below implements that -- confirm.
    view_similar = str(self.get_argument("view_similar", ""))
    simi_log.info(self.request.remote_ip + ', similar demo: ' + src)

    result = get_aysimi_demo(src, 10)

    # Normalise stopwords to text so they compare equal to the words
    # produced by jieba regardless of whether string literals are bytes
    # (Python 2, assuming a UTF-8 encoded source file) or text (Python 3).
    stopword_set = set(
        word.decode('utf-8') if isinstance(word, bytes) else word
        for word in [
            "年", "月", "日", "的", "了", "将", "诉称", "后", "于", "并", "但",
            "与", "元", "万元", "”", "、", "《", "》", ":", ";", ",", "。",
        ])

    # Words of the submitted text that count as a match in the word view.
    digits = re.compile(r"\d+")
    src_word_set = set()
    for word in jieba.cut(src, cut_all=False):
        if digits.match(word):
            continue
        if word not in stopword_set:
            src_word_set.add(word)

    src_list = seg_sentence(src)

    blank_line = re.compile(r'^\s*\n*$')
    separation = '<br>'

    def write_aligned_sentences(sentences, own_idx, other_idx):
        """Write one table cell body for the sentence-alignment view.

        ``own_idx[k]`` is the index (into ``sentences``) of the k-th
        matched pair and ``other_idx[k]`` the index of its counterpart in
        the opposite column.
        """
        for j, sentence in enumerate(sentences):
            if j in own_idx:
                rank = own_idx.index(j)  # position of the first match
                label = ('[ ' + str(j) + ' - ' + str(other_idx[rank]) +
                         ' ]' + '<sub>' + str(rank) + '</sub>' + ': ')
                text = xhtml_escape(sentence)
                if rank == 0:
                    self.write("<b><font color=\"red\">" + label + text +
                               "</font></b>")
                elif rank in (1, 2, 3):
                    self.write("<b><font color=\"blue\">" + label + text +
                               "</font></b>")
                else:
                    # Lower-ranked pairs: only the label is coloured.
                    self.write("<b><font color=\"olive\">" + label +
                               "</font></b>")
                    self.write(text)
                self.write(separation)
            elif sentence and not blank_line.match(sentence):
                self.write('[' + str(j) + ']: ' + xhtml_escape(sentence))
                self.write(separation)

    def write_word_run(run, min_len):
        """Write a run of matching words, green when it is long enough."""
        text = xhtml_escape(''.join(run))
        if len(run) >= min_len:
            self.write("<b><font color=\"green\">" + text + "</font></b>")
        else:
            self.write(text)

    self.write(
        '<!DOCTYPE html>'
        '<html><head>'
        '<meta http-equiv="content-type" content="text/html;charset=utf-8">'
        '<title>相似案例结果</title>')
    self.write("</head>")
    self.write("<body>")
    self.write("<div>")
    self.write('<table border = "1">'
               '<tr>'
               '<th style="width: 40%">' + '源文' + '</th>'
               '<th style="width: 60%">相似案例</th>'
               '</tr>')

    i = 1
    for res_one in result:
        if "src" not in res_one:
            continue

        if view_similar == 'simi_line':
            res_test_sym = [pair[0] for pair in res_one['row_col']]
            res_norm_sym = [pair[1] for pair in res_one['row_col']]

            self.write('<tr>')
            self.write('<td>')
            self.write("<p>" + str(i) + ": ")
            self.write(separation)
            write_aligned_sentences(src_list, res_test_sym, res_norm_sym)
            self.write('</td>')

            self.write('<td>')
            self.write("<p>" + str(i) + ": ")
            self.write(separation)
            write_aligned_sentences(seg_sentence(res_one["src"]),
                                    res_norm_sym, res_test_sym)
            self.write('</td>')
            self.write('</tr>')

        elif view_similar == 'simi_word' or not view_similar:
            self.write('<tr>')
            self.write('<td>')
            self.write("<p>" + str(i) + ": ")
            self.write(separation)
            self.write(xhtml_escape(src))
            self.write('</td>')

            self.write('<td>')
            self.write("<p>" + str(i) + ": ")
            self.write(separation)
            run = []
            for word in jieba.cut(res_one["src"], cut_all=False):
                if word in src_word_set:
                    run.append(word)
                    continue
                # A non-matching word ends the current run.
                write_word_run(run, 3)
                run = []
                self.write(xhtml_escape(word))
            if run:
                # NOTE(review): the trailing run is highlighted from 2
                # words while runs inside the text need 3; both thresholds
                # are kept from the original -- confirm which is intended.
                write_word_run(run, 2)
            self.write('</td>')
            self.write('</tr>')

        i += 1

    self.write('</table>')
    self.write("</div>")
    self.write("</body></html>")