Exemplo n.º 1
0
    def pro_triples_data(self, batch_size):
        """Yield batches of (previous, current, next) sentence triples.

        Streams ``self.fname`` line by line. Every non-blank line is split
        with ``seg_sentence``; the three lists returned for it by
        ``make_lines_triples`` are appended to the ``prev_sent``,
        ``curr_sent`` and ``next_sent`` buffers kept on ``self``. As soon as
        the buffers hold at least ``batch_size`` entries, everything that
        ``triples_data_iterator`` produces for them is yielded and the
        buffers are emptied again.

        Entries still buffered when the file ends (fewer than ``batch_size``)
        are not yielded; they are left in the buffers.
        """
        self.prev_sent, self.curr_sent, self.next_sent = [], [], []
        with open(self.fname) as corpus:
            for raw_line in corpus:
                text = raw_line.strip()
                if not text:
                    continue
                segments = seg_sentence(text, self.line_min_words)
                if not segments:
                    continue

                triples = self.make_lines_triples(segments)
                self.prev_sent.extend(triples[0])
                self.curr_sent.extend(triples[1])
                self.next_sent.extend(triples[2])

                if len(self.curr_sent) >= batch_size:
                    # TODO: Optimize batch-data initialization.
                    batches = self.triples_data_iterator(
                        self.prev_sent,
                        self.curr_sent,
                        self.next_sent,
                        max_len=self.max_len,
                        batch_size=batch_size)
                    for batch in batches:
                        yield batch

                    # Start the next batch from fresh buffers.
                    self.prev_sent, self.curr_sent, self.next_sent = (
                        [], [], [])
Exemplo n.º 2
0
    def _build_vocabulary_and_stats(self):
        """Build the vocabulary and collect corpus statistics in one pass.

        For every line of ``self.fname``:

        * ``self.max_len`` is raised to the token count of the longest
          segment returned by ``seg_sentence`` plus 2 (room for the
          ``<go>`` and ``<eos>`` markers);
        * lines that tokenize to nothing are skipped; all others are added
          to ``self.vocab`` and counted in ``self.total_lines``.

        Progress is logged every ``self.verbose`` counted lines (never when
        ``self.verbose`` is 0). After the pass the vocabulary is trimmed with
        ``self.vocab.cut_by_freq(self.max_vocab_size)``.

        ``self.vocab`` and ``self.max_len`` must already be initialised.
        """
        with open(self.fname) as f:
            self.total_lines = 0
            for line in f:
                tokens = self._tok_line(line)

                # A line may produce no segments at all (e.g. a blank line);
                # calling max() on an empty sequence raises ValueError, so
                # only update max_len when there is something to measure.
                segments = seg_sentence(line) or []
                segment_lengths = [
                    len(self._tok_line(segment)) for segment in segments
                ]
                if segment_lengths:
                    # 2 = len([<go>, <eos>])
                    tmp_max_len = max(segment_lengths) + 2
                    if tmp_max_len > self.max_len:
                        self.max_len = tmp_max_len

                if not tokens:
                    continue
                self.vocab.add_words(tokens)

                self.total_lines += 1
                # Guard against verbose == 0, which would otherwise raise
                # ZeroDivisionError in the modulo below.
                if self.verbose and self.total_lines % self.verbose == 0:
                    self._logger.info('Read\t{0} lines.'.format(
                        self.total_lines))
        self.vocab.cut_by_freq(self.max_vocab_size)
        self._logger.info('Done building vocab and stats.')
Exemplo n.º 3
0
    def pro_tuple_data(self, out_file_name, batch_size=1):
        """Yield encoded batches for each line of ``out_file_name``.

        Nothing is yielded when ``out_file_name`` is empty or does not exist.
        Otherwise every non-blank line is split with ``seg_sentence``; the
        non-empty segments are passed to ``lines_curr_iterator`` and each
        item it produces is yielded, followed by
        ``TextData.ONE_LINE_TOKEN`` to mark the end of that line.
        """
        if not (out_file_name and os.path.exists(out_file_name)):
            return

        with open(out_file_name) as source:
            for raw_line in source:
                text = raw_line.strip()
                if not text:
                    continue
                segments = seg_sentence(text, self.line_min_words)
                if not segments:
                    continue

                # Each line is processed on its own, so a fresh list per
                # line replaces the accumulate-then-reset buffer.
                sentences = [segment for segment in segments if segment]
                for batch in self.lines_curr_iterator(
                        sentences, batch_size=batch_size):
                    yield batch
                yield TextData.ONE_LINE_TOKEN
    def post(self):
        """Render the similar-case comparison page for a submitted text.

        Reads the ``message`` (source text) and ``view_similar`` (display
        mode) request arguments, fetches the top 10 similar cases through
        ``get_aysimi_demo`` and writes a two-column HTML table: the source
        text on the left, one similar case per row on the right.

        Display modes:

        * ``simi_line`` -- both texts are split with ``seg_sentence`` and the
          sentence pairs listed in ``res_one['row_col']`` are highlighted
          and cross-referenced.
        * ``simi_word`` -- runs of words of the similar case that also occur
          in the source text are highlighted in green.

        Results without a ``"src"`` key are skipped. Any other mode value
        produces a table without rows.

        All text taken from the request or from the results is HTML-escaped
        before it is written, because ``message`` is untrusted input.
        """
        from html import escape

        from matrixText.matrix_seg import seg_sentence

        self.set_header("Content-Type", "text/html")
        src = self.get_argument("message")
        view_similar = str(self.get_argument("view_similar"))
        simi_log.info(self.request.remote_ip + ', similar demo: ' + src)
        result = get_aysimi_demo(src, 10)

        stopword_set = {
            "年", "月", "日", "的", "了", "将", "诉称", "后", "于", "并", "但", "与", "元",
            "万元", "”", "、", "《", "》", ":", ";", ",", "。",
        }
        digits = re.compile(r"\d+")
        # Words are compared to the stopwords as text. The previous
        # ``word.encode()`` produced bytes, which never equal the str
        # stopwords on Python 3, so no stopword was ever filtered out.
        src_word_set = {
            word
            for word in jieba.cut(src, cut_all=False)
            if not digits.match(word) and word not in stopword_set
        }
        src_list = seg_sentence(src)

        self.write(
            '<!DOCTYPE html>'
            '<html><head>'
            '<meta http-equiv="content-type" content="text/html;charset=utf-8">'
            '<title>相似案例结果</title>')
        self.write("</head>")
        self.write("<body>")
        self.write("<div>")

        self.write('<table border = "1">'
                   '<tr>'
                   '<th style="width: 40%">' + '源文' + '</th>'
                   '<th style="width: 60%">相似案例</th>'
                   '</tr>')

        separation = '<br>'
        safe_src = escape(str(src))
        i = 1
        for res_one in result:
            if "src" not in res_one:
                continue

            # Every cell starts with the running row number.
            cell_open = ('<td>' + "<p>" + str(i) +
                         ":&nbsp;&nbsp;&nbsp;&nbsp;" + separation)

            if view_similar == 'simi_line':
                # row_col holds (source sentence index, case sentence index)
                # pairs; the position of a pair in the list is its rank.
                row_col = res_one['row_col']
                res_test_sym = [pair[0] for pair in row_col]
                res_norm_sym = [pair[1] for pair in row_col]

                self.write('<tr>')
                self.write(cell_open)
                self._write_aligned_sentences(
                    src_list, res_test_sym, res_norm_sym, separation)
                self.write('</td>')

                self.write(cell_open)
                self._write_aligned_sentences(
                    seg_sentence(res_one["src"]), res_norm_sym,
                    res_test_sym, separation)
                self.write('</td>')
                self.write('</tr>')

            # NOTE: the former ``or view_similar == []`` test was dead code,
            # since view_similar is always a str at this point.
            elif view_similar == 'simi_word':
                self.write('<tr>')
                self.write(cell_open)
                self.write(safe_src)
                self.write('</td>')

                self.write(cell_open)
                self._write_shared_words(
                    jieba.cut(res_one["src"], cut_all=False), src_word_set)
                self.write('</td>')
                self.write('</tr>')

            i += 1

        self.write('</table>')
        self.write("</div>")
        self.write("</body></html>")

    def _write_aligned_sentences(self, sentences, own_syms, other_syms,
                                 separation='<br>'):
        """Write one column of the sentence-alignment (``simi_line``) view.

        ``own_syms[k]`` is the index into ``sentences`` of the k-th matched
        pair and ``other_syms[k]`` the index of its partner in the other
        text. A matched sentence is prefixed with
        ``[ own - partner ]<sub>k</sub>`` and coloured by rank: red for
        rank 0, blue for ranks 1-3, and for lower ranks only the prefix is
        coloured (olive). Unmatched sentences are written as ``[index]:
        text``; empty or whitespace-only ones are omitted.
        """
        from html import escape

        blank = re.compile(r'^\s*\n*$')
        for j, sentence in enumerate(sentences):
            if j not in own_syms:
                if sentence and not blank.match(sentence):
                    self.write('[' + str(j) + ']: ' + escape(sentence))
                    self.write(separation)
                continue

            # list.index returns the first occurrence, i.e. the best rank.
            rank = own_syms.index(j)
            label = ('[ ' + str(j) + ' - ' + str(other_syms[rank]) + ' ]' +
                     '<sub>' + str(rank) + '</sub>' + ': ')
            text = escape(sentence)
            if rank == 0:
                self.write("<b><font color=\"red\">" + label + text +
                           "</font></b>")
            elif rank in (1, 2, 3):
                self.write("<b><font color=\"blue\">" + label + text +
                           "</font></b>")
            else:
                self.write("<b><font color=\"olive\">" + label +
                           "</font></b>")
                self.write(text)
            self.write(separation)

    def _write_shared_words(self, words, src_word_set):
        """Write ``words`` in order, highlighting runs shared with the source.

        Consecutive words contained in ``src_word_set`` are collected into a
        run. A run followed by an unshared word is written in green when it
        has at least 3 words; a run that ends the text is written in green
        when it has at least 2 words. Shorter runs and all other words are
        written as plain text.
        """
        from html import escape

        def flush(run, min_run):
            """Write the collected run, in green if it is long enough."""
            if not run:
                return
            joined = escape("".join(run))
            if len(run) >= min_run:
                self.write("<b><font color=\"green\">" + joined +
                           "</font></b>")
            else:
                self.write(joined)

        run = []
        for word in words:
            if word in src_word_set:
                run.append(word)
                continue
            flush(run, 3)
            run = []
            self.write(escape(word))
        # NOTE(review): the trailing run uses a threshold of 2 while runs
        # inside the text use 3. Kept as in the original -- confirm which
        # value is intended.
        flush(run, 2)