예제 #1
0
    def __cal_matrix(self, file_name=""):
        """
        build the feature matrix selected by self.__feature_method

        "QD": a square matrix with one row/column per entry of self.__paper,
              filled from self.__cal_matrix_element and mirrored so that
              matrix[i][j] == matrix[j][i].
        "DM": whatever self.__get_doc2vec_matrix returns for the
              "word_segment.vec" file under self.__child_path.

        When self.summary_method is "hDPP" the matrix is additionally stored
        in self.__doc_matrix_.

        :param file_name: unused; kept so existing callers keep working
        :return: the feature matrix, or an empty list when
                 self.__feature_method is neither "QD" nor "DM"
        """
        log.info("extract feature from pre-defined setting!")
        if self.__feature_method == "QD":
            paper_len = len(self.__paper)
            matrix_l = np.zeros([paper_len, paper_len])
            for i in range(paper_len):
                # Only j <= i is evaluated; the value is mirrored into the
                # other triangle.  The previous full-square loop computed
                # every off-diagonal pair twice and kept the result of the
                # later (larger index first) call, which is exactly the call
                # made here, so the resulting matrix is the same for half
                # the work.
                # NOTE(review): assumes __cal_matrix_element is deterministic.
                for j in range(i + 1):
                    num = self.__cal_matrix_element(i, j)
                    matrix_l[i][j] = num
                    matrix_l[j][i] = num
        elif self.__feature_method == "DM":
            file_path = self.__child_path + "word_segment.vec"
            matrix_l = self.__get_doc2vec_matrix(file_path)
        else:
            log.error("self.__feature_method is " + self.__feature_method)
            return []
        # NOTE: feature normalization (self.__feature_normalization) is
        # intentionally not applied here; it was disabled in the original.
        if self.summary_method == "hDPP":
            self.__doc_matrix_ = matrix_l
        return matrix_l
예제 #2
0
def write_file(file_conent, file_path, if_convert=True):
    """
    write every item of file_conent to file_path, one item per line, UTF-8

    The file is created or truncated.  If it cannot be opened the error is
    logged and the process exits with a non-zero status.

    :param file_conent: iterable of text lines (without trailing newline)
    :param file_path: path of the file to write
    :param if_convert: unused; kept so existing callers keep working
    :return: None
    """
    # Local imports keep this block self-contained.
    import io
    import sys

    try:
        # io.open with an explicit encoding accepts text on both Python 2
        # and Python 3, so lines no longer need to be encoded by hand.
        output = io.open(file_path, 'w', encoding='utf-8')
    except IOError as e:
        log.error(e)
        # Previously the write loop lived in a ``finally`` clause, so a
        # failed open ended in a NameError on ``output`` instead of exiting.
        sys.exit(1)
    # ``with`` closes the handle even when a write fails part-way through.
    with output:
        for cur_line in file_conent:
            output.write(cur_line + u"\n")
예제 #3
0
    def get_mss_paper_summary(self, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization

        Reads the pre-processed "*.temp" files under self.__child_path,
        selects candidate sentences and builds the summary from them.

        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file
                              "<file_name>_result.txt" under self.__child_path
        :return: the summary items joined into one string, or "" when
                 if_write_file is set but file_name is empty
        """
        # reset per-paper state
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""

        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        else:
            # Without this the paper loaded by a previous call (if any) would
            # be reused silently.
            log.error("unknown stop_word_method: " +
                      str(self.stop_word_method))
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]

        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig)

        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE; the path string is the same as
            # before, only the no-op single-argument os.path.join was dropped
            answer_file = '%s/%s.txt' % (self.__child_path,
                                         file_name + '_result')
            write_file(summary, answer_file, False)

        return "".join(summary)
예제 #4
0
def read_file(file_path):
    """
    read file_path and return its non-empty lines as a list

    The raw content has every match of the module-level __pattern replaced
    by a newline before it is split into lines.  Each kept line is stripped
    and decoded as UTF-8.  If the file cannot be opened the error is logged
    and an empty list is returned.

    :param file_path: path of the file to read
    :return: list of decoded lines (empty when the file cannot be opened)
    """
    try:
        handle = open(file_path, 'rb')
    except IOError as err:
        log.error(err)
        return list()

    # the context manager closes the handle even if reading/decoding fails
    with handle:
        normalized = __pattern.sub('\n', handle.read())
        return [
            piece.strip().decode("utf-8")
            for piece in normalized.split('\n')
            if piece != ""
        ]
예제 #5
0
    def __cal_candidate_set(self):
        """
        pick candidate sentence indices according to self.candidate_method

        "DR":      sample once from the whole feature matrix with ds.sample.
        "CLU-DPP": cluster the sentences with hlda_analysis.sentence_cluster
                   and sample inside each processed cluster; the result then
                   holds one index list (and one eigenvalue list) per cluster.
        "RANDOM":  20 random indices into self.__paper_original; no
                   eigenvalues are produced.

        Any other value is reported through log.error.  Errors raised while
        selecting are logged and whatever was collected so far is returned.

        :return: tuple (subset_, eigenvalue)
        """
        matrix_l = self.__cal_matrix()
        subset_ = []
        eigenvalue = []
        try:
            if self.candidate_method == "DR":
                subset_, eigenvalue = ds.sample(matrix_l)

            elif self.candidate_method == "CLU-DPP":
                cluster = hlda_analysis.sentence_cluster(
                    self.__child_path, "run000")
                # debug hLDA message: total cluster number and the sentences
                # of each cluster
                log.debug("cluster number: " + str(len(cluster)))
                sentences = np.array(self.__paper_original)
                parts = []
                for idx, sen_list in enumerate(cluster):
                    parts.append("\n cluster: " + str(idx) +
                                 "\tsentence_num is " + str(len(sen_list)) +
                                 "\n")
                    parts.append("\n".join(sentences[sen_list]))
                log.debug("".join(parts))
                # begin calculate and get sentence
                # NOTE(review): only the first half of the clusters is
                # processed - confirm this is intended.
                # ``//`` keeps the bound an integer on Python 3 as well.
                for i in range(len(cluster) // 2):
                    sen_list = cluster[i]
                    # rows, then columns: the sub-matrix of this cluster
                    tmp_matrix = matrix_l[sen_list][:, sen_list]
                    tmp_set, eig = ds.sample(tmp_matrix)
                    if len(sen_list) < 10:
                        # small cluster: keep all of its sentences
                        subset_.append(sen_list)
                        eigenvalue.append(eig)
                        continue
                    subset_.append(np.array(sen_list)[tmp_set].tolist())
                    eigenvalue.append(np.array(eig)[tmp_set].tolist())
            elif self.candidate_method == "RANDOM":
                for _ in range(20):
                    subset_.append(
                        np.random.randint(0, len(self.__paper_original)))
            else:
                raise RuntimeError("value error: " + self.candidate_method)
        except Exception as e:
            # The old ``return`` inside ``finally`` silently discarded every
            # exception, including KeyboardInterrupt and SystemExit.  The
            # best-effort behaviour is kept for ordinary errors, but they
            # are now logged, and interpreter-exit signals propagate.
            log.error(e)
        return subset_, eigenvalue
예제 #6
0
 def __quality_calculating(self, idx):
     """
     calculate quality using different methods

     The quality vector (one entry per element of self.__paper) is built on
     the first call and cached in self.__quality; later calls only look the
     value up.  The vector starts as zeros and the __quality_initial_*
     helpers are then run in a fixed order.
     NOTE(review): presumably each helper adds its score into
     self.__quality - confirm against their implementations.

     If self.__paper is None the error is logged and the process exits with
     a non-zero status.

     :param idx: index of matrix element
     :return: the quality value stored at position idx
     """
     if self.__paper is None:
         # The message used to be empty and the exit status was 0, which
         # made this fatal condition look like a successful run.
         log.error("paper is not loaded, can not calculate quality")
         sys.exit(1)
     if self.__quality is None:
         self.__quality = np.zeros([len(self.__paper)])
         self.__quality_initial_length()
         self.__quality_initial_coverage()
         self.__quality_initial_position()
         self.__quality_initial_level()
         self.__quality_initial_similarity()
     return self.__quality[idx]
예제 #7
0
    def __cal_matrix(self, file_name=""):
        """
        build the feature matrix selected by self.__feature_method

        "QD" fills a symmetric square matrix (one row/column per entry of
        self.__paper) from self.__cal_matrix_element, evaluating only the
        pairs with row <= col.  "DM" returns the result of
        self.__get_doc2vec_matrix for "word_segment.vec" under
        self.__child_path.  For "hDPP" summaries the matrix is also kept in
        self.__doc_matrix_.

        :param file_name: not used by this method
        :return: the feature matrix, or an empty list for an unknown
                 feature method
        """
        log.info("extract feature from pre-defined setting!")
        method = self.__feature_method

        # guard: anything but the two supported methods is an error
        if method not in ("QD", "DM"):
            log.error("self.__feature_method is " + method)
            return []

        if method == "DM":
            result = self.__get_doc2vec_matrix(
                self.__child_path + "word_segment.vec")
        else:
            size = len(self.__paper)
            result = np.zeros([size, size])
            for row in range(size):
                # start at the diagonal; the mirrored cell is written below
                for col in range(row, size):
                    value = self.__cal_matrix_element(row, col)
                    result[row][col] = value
                    result[col][row] = value

        if self.summary_method == "hDPP":
            self.__doc_matrix_ = result
        return result
예제 #8
0
    def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization

        Reads the pre-processed "*.temp" files under self.__child_path,
        selects candidate sentences and builds the summary from them.

        :param lang: language identifier; passed to __construct_summary and
                     used as the sub-directory of self.__rouge_path that
                     receives the answer file
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return: the summary items joined into one string, or "" when
                 if_write_file is set but file_name is empty
        """
        # reset per-paper state
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""
        # NOTE: the legacy corpus-merging / model.temp generation step that
        # used to sit here as a disabled string block has been dropped; the
        # "*.temp" inputs are expected to exist already.
        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        else:
            # Without this the paper loaded by a previous call (if any) would
            # be reused silently.
            log.error("unknown stop_word_method: " +
                      str(self.stop_word_method))
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]
        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        # NOTE(review): logged at error level although it is informational;
        # left unchanged so existing log output stays the same.
        log.error("results is: ")
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig, lang)
        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE; the path string is the same as
            # before, only the no-op single-argument os.path.join was dropped
            answer_path = self.__rouge_path + lang + "/systems/"
            write_file(summary, '%s%s.txt' % (answer_path, file_name), False)
            # NOTE: generation of the gold summaries and of the per-document
            # ROUGE configuration files was already disabled and is omitted.

        return "".join(summary)