Example #1
def get_lda_input_format():
    file_content = read_file(
        "..\\data\\sample_datas\\evasampledata4-TaskAA.txt")
    lda_format = list()
    cur_target = ""
    count = 0
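    # each non-empty line is expected to be "id<TAB>target<TAB>text<TAB>stance"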
    for cur_line in file_content:
        if cur_line == "":
            continue
        (id_, target, text, stance) = cur_line.split("\t")
        # reset the per-target counter whenever the target changes
        # (also covers the very first line, when cur_target is still empty)
        if cur_target != target:
            cur_target = target
            count = 0
        # keep at most 500 texts per target
        if count < 500:
            lda_format.append(text)
            count += 1


#        if target != "IphoneSE":
#            continue
#        lda_format.append(" ".join(jieba.cut(text)))
    write_file(lda_format, "../nlpcc2016.txt")
Example #2
def __processing_using_ros(original_path="../data/origin",
                           data_backup_path="../data/ros_result"):
    px = ParseXML()
    for cur_file in os.listdir(original_path):
        dir_name = cur_file.split(".")[0]
        if dir_name == "test":
            continue
        out_path = os.path.join("%s/%s/" % (data_backup_path, dir_name))
        # NOTE: the trailing "and False" disables this skip, so every document is
        # reprocessed even when the cached .temp files already exist
        if os.path.exists(out_path + "/tokenized_body.temp") and \
                os.path.exists(out_path + "/tokenized_title.temp") and False:
            continue
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        # parse the XML document so that all_content and title below are populated
        px.parse(os.path.join("%s/%s" % (original_path, cur_file)))
        contents = px.all_content
        print contents
        titles = px.title
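        # "api" is assumed here to be a module-level tokenizer client initialised
        # elsewhere in this module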
        word_segmented = api.tokenize(u" ".join(contents))
        write_file(contents, os.path.join("%s/%s.txt" % (out_path, dir_name)), False)
        write_file(word_segmented, out_path + "/tokenized_body.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_body.temp", False)
        word_segmented = api.tokenize(u" ".join(titles))
        write_file(word_segmented, out_path + "/tokenized_title.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_title.temp", False)
Example #3
def get_rouge_ans():
    memog_res = read_file("../data/2017_results/MSS2017_ROUGE_1.5.7_CI.csv")
    ori_res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    ori_team = np.array(
        [line.replace(" ", "").split(",")[0] for line in memog_res])
    ori_priority = np.array(
        [line.replace(" ", "").split(",")[1] for line in memog_res])
    ori_lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    rouge_n = np.array(
        [line.replace(" ", "").split(",")[3] for line in memog_res])
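    # for each ROUGE-N metric, rank the team runs per language from best to
    # worst score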
    for cur_rouge in ["ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-4"]:
        ans = []
        tmp_set = set()
        rouge_idx = np.where(rouge_n == cur_rouge)[0]
        lang = ori_lang[rouge_idx]
        priority = ori_priority[rouge_idx]
        team = ori_team[rouge_idx]
        # compare scores numerically rather than as the raw CSV strings
        res_value = ori_res_value[rouge_idx].astype(float)
        for cur_lang in set(lang):
            print cur_lang
            idx = np.where(lang == cur_lang)[0]
            cur_value = res_value[idx]
            cur_priority = priority[idx]
            cur_team = team[idx]
            tmp_ans = cur_lang + '\t'
            tmp_len = 0
            while tmp_len < len(cur_value):
                # indices of the runs tied for the current best score
                idx_max = np.where(cur_value == max(cur_value))[0]
                tmp_len += len(idx_max)
                for idx_1 in idx_max:
                    tmp_ans += cur_team[idx_1] + "-" + cur_priority[idx_1] + '\t'
                cur_value[idx_max] = -1
            ans.append(tmp_ans)
        tmp_set = set([team[i] + "-" + priority[i] for i in range(len(team))])
        write_file(ans, "../" + cur_rouge + "_ans.txt")
        final_ans = []
        final_value = []
        tmp_set = sorted(list(tmp_set))
        final_ans.append("lang," + ",".join(tmp_set))
        for cur_ans in ans:
            ans_list = cur_ans.split('\t')
            print ans_list
            final_tmp_ans = ["" for i in range(len(tmp_set) + 1)]
            final_tmp_ans[0] = ans_list[0]
            for i in range(len(tmp_set)):
                if tmp_set[i] in ans_list:
                    final_tmp_ans[i + 1] = str(
                        np.where(np.array(ans_list) == tmp_set[i])[0][0])
            final_ans.append(",".join(final_tmp_ans))
        write_file(final_ans, "../final_" + cur_rouge + "_ans.csv")
    print "end"
Example #4
    def get_mss_paper_summary(self, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization
        :param lang:
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return:
        """
        # initial
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""

        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]

        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        #print len(feature_subset)
        #        feature_subset = range(len(self.__paper_original))
        #        eig = []
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig)

        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE
            #print self.__rouge_path
            answer_path = self.__child_path
            write_file(
                summary,
                os.path.join('%s/%s.txt' %
                             (answer_path, file_name + '_result')), False)

        return "".join(summary)
def model_temp(content, out_path):
    # build the hLDA input: one line per sentence of the form
    # "<#unique words> <word_idx>:<count> ...", plus a vocabulary file
    hlda_model = []
    word_list = list(set(" ".join(content).split(" ")))
    if "" in word_list:
        word_list.remove("")
    # map each word to its index once instead of rescanning the vocabulary per word
    word_idx = dict((word, idx) for idx, word in enumerate(word_list))
    for sentence in content:
        sen_word = sentence.split(" ")
        union_word = set(sen_word)
        if "" in union_word:
            union_word.remove("")
        sen_model = str(len(union_word))
        for word in union_word:
            sen_model += " " + str(word_idx[word]) + ":" + str(sen_word.count(word))
        hlda_model.append(sen_model)
    write_file(hlda_model, out_path + "/model.temp", False)
    write_file(word_list, out_path + "/words.temp", False)
Example #6
def get_memog_ans():
    memog_res = read_file("../data/2017_results/MSS2017_MeMoG_CI_March30.csv")
    res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    team = np.array(
        [line.replace(" ", "").split(",")[0] for line in memog_res])
    priority = np.array(
        [line.replace(" ", "").split(",")[1] for line in memog_res])
    lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    ans = []
    memog_answer_value = []
    tmp_set = set()
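    # rank the team runs per language from best to worst MeMoG score; note that
    # res_value holds the raw CSV strings, so max() below compares them as text,
    # which is only safe while all scores share the same textual format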
    for cur_lang in set(lang):
        print cur_lang
        idx = np.where(lang == cur_lang)[0]
        cur_value = res_value[idx]
        cur_priority = priority[idx]
        cur_team = team[idx]
        tmp_ans = cur_lang + '\t'
        tmp_len = 0
        while tmp_len < len(cur_value):
            idx_max = np.where(cur_value == max(cur_value))[0]
            tmp_len += len(idx_max)
            for idx_1 in idx_max:
                tmp_ans += cur_team[idx_1] + "-" + cur_priority[idx_1] + '\t'
            cur_value[idx_max] = -1
        ans.append(tmp_ans)
    tmp_set = set([team[i] + "-" + priority[i] for i in range(len(team))])
    write_file(ans, "../memog_ans_march30.txt")
    final_ans = []
    tmp_set = sorted(list(tmp_set))
    final_ans.append("lang," + ",".join(tmp_set))
    for cur_ans in ans:
        ans_list = cur_ans.split('\t')
        print ans_list
        final_tmp_ans = ["" for i in range(len(tmp_set) + 1)]
        final_tmp_ans[0] = ans_list[0]
        for i in range(len(tmp_set)):
            if tmp_set[i] in ans_list:
                final_tmp_ans[i + 1] = str(
                    np.where(np.array(ans_list) == tmp_set[i])[0][0])
        final_ans.append(",".join(final_tmp_ans))
    write_file(final_ans, "../final_memog_ans_march30.csv")
    print "end"
Example #7
def get_rouge_value():
    memog_res = read_file("../data/2017_results/MSS2017_ROUGE_1.5.7_CI.csv")
    ori_res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    ori_team_priority = np.array([
        line.replace(" ", "").split(",")[0] + "-" +
        line.replace(" ", "").split(",")[1] for line in memog_res
    ])
    ori_lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    rouge_n = np.array(
        [line.replace(" ", "").split(",")[3] for line in memog_res])
    for cur_rouge in ["ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-4"]:
        ans = []
        # collect the column headers: use the language with the most submissions
        # so that every team-priority run gets a column
        tmp_team_priority = []
        rouge_idx = np.where(rouge_n == cur_rouge)[0]
        lang = ori_lang[rouge_idx]
        team_priority = ori_team_priority[rouge_idx]
        res_value = ori_res_value[rouge_idx]
        for cur_lang in set(lang):
            idx = np.where(lang == cur_lang)[0]
            if len(tmp_team_priority) < len(team_priority[idx]):
                tmp_team_priority = team_priority[idx]
        tmp_team_priority = sorted(tmp_team_priority)
        print tmp_team_priority
        print len(tmp_team_priority)

        ans.append("lang\t" + "\t".join(tmp_team_priority))
        for cur_lang in set(lang):
            print cur_lang
            tmp_ans = cur_lang
            idx = np.where(lang == cur_lang)[0]
            cur_lang_value = res_value[idx]
            print team_priority[idx]
            for cur_team_priority in tmp_team_priority:
                idx_1 = np.where(team_priority[idx] == cur_team_priority)[0]
                if len(idx_1) == 0:
                    tmp_ans += "\t-"
                    continue
                tmp_ans += "\t" + str(float(cur_lang_value[idx_1[0]]) * 100)
            ans.append(tmp_ans)
        write_file(ans, "../final_" + cur_rouge + "_value.csv")
    print "end"
Example #8
def analyse_rouge_value(file_path, rouge_n):
    # NOTE: rouge_n is currently unused; the scores are pulled from the second,
    # third and fourth lines after the "configure_all.out" marker in each .log file
    log.info("analysing rouge result ...")
    analysed_log = []
    for cur_log_file in os.listdir(file_path):
        if not cur_log_file.endswith(".log"):
            continue
        log_content = read_file(
            os.path.join("%s/%s" % (file_path, cur_log_file)))
        file_name = os.path.basename(cur_log_file)
        tmp_log = ""
        for i in range(len(log_content)):
            if log_content[i].endswith("configure_all.out"):
                tmp_log = "all\t"
                tmp_log += "\t".join(
                    file_name.replace(".log", "").split(".")[3:])
                tmp_log += "\t" + log_content[i + 2].split(" ")[3]
                tmp_log += "\t" + log_content[i + 3].split(" ")[3]
                tmp_log += "\t" + log_content[i + 4].split(" ")[3]
                break
        analysed_log.append(tmp_log)
    write_file(analysed_log, "./data/log_analysis/ana.log", False)
Example #9
def get_memog_value():
    memog_res = read_file("../data/2017_results/MSS2017_MeMoG_CI_March30.csv")
    res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    team_priority = np.array([
        line.replace(" ", "").split(",")[0] + "-" +
        line.replace(" ", "").split(",")[1] for line in memog_res
    ])
    lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    ans = []

    # collect the column headers: use the language with the most submissions
    # so that every team-priority run gets a column
    tmp_team_priority = []
    for cur_lang in set(lang):
        idx = np.where(lang == cur_lang)[0]
        if len(tmp_team_priority) < len(team_priority[idx]):
            tmp_team_priority = team_priority[idx]
    tmp_team_priority = sorted(tmp_team_priority)
    print tmp_team_priority
    print len(tmp_team_priority)

    ans.append("lang\t" + "\t".join(tmp_team_priority))
    for cur_lang in set(lang):
        print cur_lang
        tmp_ans = cur_lang
        idx = np.where(lang == cur_lang)[0]
        cur_lang_value = res_value[idx]
        print team_priority[idx]
        for cur_team_priority in tmp_team_priority:
            idx_1 = np.where(team_priority[idx] == cur_team_priority)[0]
            if len(idx_1) == 0:
                tmp_ans += "\t-"
                continue
            tmp_ans += "\t" + str(float(cur_lang_value[idx_1[0]]) * 100)
        ans.append(tmp_ans)
    write_file(ans, "../final_memog_value_march30.csv")
    print "end"
Example #10
    def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
        """
        generate summary for one paper, single document summarization
        :param lang:
        :param file_name: current file name, used for write summary answer
        :param if_write_file: whether write generated summary to answer file named file_name
        :return:
        """
        # initial
        self.__quality, self.__paper_name = None, file_name
        self.quality_method__ = ""
        '''
        if DATA == "mms2015":
            self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
        elif DATA == "mss2017":
            if lang in ["vi", "ka"]:
                self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
            else:
                self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
        self.__paper_original = self.__all_file.get_merged_paper()
        if self.stop_word_method == "remove_stop":
            self.__paper = self.__all_file.get_filtered_paper()
        elif self.stop_word_method == "with_stop":
            self.__paper = self.__all_file.get_merged_paper()
        self.__titles = self.__all_file.get_titles()
        # used for generate hLDA input file and calculate level method.
        if (not os.path.exists(self.__child_path + "model.temp")) or False:
            write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
            write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
            model_temp(self.__paper, self.__child_path)
            return ""
        '''
        if self.stop_word_method == "remove_stop":
            self.__paper = read_file(self.__child_path + "RemoveStop.temp")
        elif self.stop_word_method == "with_stop":
            self.__paper = read_file(self.__child_path + "word_segment.temp")
        self.__titles = read_file(self.__child_path + "titles.temp")
        self.__paper_original = read_file(self.__child_path +
                                          "word_segment.temp")
        self.__sub_paper_len = [
            int(i) for i in read_file(self.__child_path + "sec_idx.temp")
        ]
        # extract sentence
        feature_subset, eig = self.__cal_candidate_set()
        #        feature_subset = range(len(self.__paper_original))
        #        eig = []
        log.error("results is: ")
        log.info(feature_subset)
        log.debug(eig)
        # use feature list to extract summary
        summary = self.__construct_summary(feature_subset, eig, lang)
        if if_write_file:
            if file_name == '':
                log.error("file name is empty")
                return ""
            # write answer to file for ROUGE
            answer_path = self.__rouge_path + lang + "/systems/"
            write_file(summary,
                       os.path.join('%s%s.txt' % (answer_path, file_name)),
                       False)
            '''
            # generate gold summary split by CHAR
            gold_path = self.__rouge_path + lang + "/models/"
            if not os.path.exists(gold_path):
                os.makedirs(gold_path)
            tmp_name = lang + "/" + file_name + "_summary.txt"
            abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
            if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
                write_file([" ".join(api.tokenize("\n".join(abs_human)))],
                           gold_path + file_name + "_summary.txt", False)
            if lang == "vi":
                write_file(abs_human, gold_path + file_name + "_summary.txt", False)
            # generate configure file of each document for ROUGE
            conf_path = self.__rouge_path + lang + "/configure/"
            if not os.path.exists(conf_path):
                os.makedirs(conf_path)
            tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
            self.__all_conf.append(tmp_conf_)
            write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
            '''

        return "".join(summary)
Example #11
def ini_mss2015_data(root_path, out_path):
    #__processing_using_ros()
    #__processing_using_nltk()
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    #__stop_word = read_file("./third_part/dict/stop_list.txt")
    for cur_file in os.listdir(root_path):
        out_dir_name = cur_file
        out_dir_path = os.path.join("%s/%s" % (out_path, out_dir_name))
        os.mkdir(out_dir_path)
        content = read_file(root_path + "/" + cur_file + "/" + cur_file + ".txt")
        write_file(content, out_dir_path + "/" + out_dir_name + ".txt", False)
        # start generate temp file
        tokenized_paper = read_file(root_path + "/" + cur_file + "/lemmatized_body.temp")
        remove_stop = []
        segmented_paper = []
        no_bracket_str = []
        section_set = []
        tmp_str = ""
        tmp_removed_str = ""
        tmp_no_bracket_str = ""
        __brackets = False
        tmp_int = 0
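        # walk the lemmatized token stream and rebuild sentences; "#" tokens mark
        # section boundaries, and words inside brackets are kept out of the
        # bracket-free variant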
        for word in tokenized_paper:
            if word == "(" or word == u"(":
                __brackets = True
            elif word == ")" or word == u")":
                __brackets = False
            # NOTE: stop-word filtering is disabled (the check below stays commented
            # out), so RemoveStop.temp currently keeps every token
            #if word not in __stop_word:
            tmp_removed_str += word + " "
            if __brackets:
                tmp_str += word + " "
                continue
            if word != "#":
                tmp_no_bracket_str += word + " "
                tmp_str += word + " "
            if word.endswith(".") or word in [u"。", u"!", u"?", u";", u"#"]:
                if tmp_removed_str != "":
                    segmented_paper.append(tmp_str)
                    remove_stop.append(tmp_removed_str)
                    no_bracket_str.append(tmp_no_bracket_str)
                    tmp_int += 1
                if word == "#":
                    section_set.append(str(tmp_int - 1))
                tmp_str = ""
                tmp_removed_str = ""
                tmp_no_bracket_str = ""
        section_set.append(str(len(segmented_paper)))
        write_file(remove_stop, out_dir_path + "/RemoveStop.temp", False)
        write_file(segmented_paper, out_dir_path + "/word_segment.temp", False)
        write_file(no_bracket_str, out_dir_path + "/word_remove_bracket.temp", False)
        titles = read_file(root_path + "/" + cur_file + "/tokenized_title.temp")
        write_file([" ".join(titles)], out_dir_path + "/titles.temp", False)
        write_file(section_set, out_dir_path + "/sec_idx.temp", False)
        model_temp(segmented_paper, out_dir_path)

    return ""
Example #12
def __processing_using_nltk(original_path,
                            data_backup_path):
    # ParseXML is instantiated here but never used in this NLTK-based variant;
    # the documents are read below as plain text instead
    px = ParseXML()
    for cur_file in os.listdir(original_path):
        dir_name = cur_file.split(".")[0]
        if dir_name == "test":
            continue
        out_path = os.path.join("%s/%s/" % (data_backup_path, dir_name))
        # NOTE: the trailing "and False" disables this skip, so every document is
        # reprocessed even when the cached .temp files already exist
        if os.path.exists(out_path + "/tokenized_body.temp") and \
                os.path.exists(out_path + "/tokenized_title.temp") and False:
            continue
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        #px.parse(os.path.join("%s/%s" % (original_path, cur_file)))
        cur_content = read_file(os.path.join("%s/%s" % (original_path, cur_file)))
        contents = cur_content
        #print contents
        # the first line is treated as the title; keep it as a one-element list so
        # the " ".join calls below do not split it into single characters
        titles = [cur_content[0]]
        print titles
        word_segmented = word_tokenize(" ".join(contents))
        write_file(contents, os.path.join("%s/%s.txt" % (out_path, dir_name)), False)
        write_file(word_segmented, out_path + "/tokenized_paper.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_paper.temp", False)
        write_file(word_tokenize(" ".join(titles)), out_path + "/tokenized_title.temp", False)
        write_file(word_tokenize(" ".join(titles)), out_path + "/lemmatized_title.temp", False)
        write_file(word_tokenize(" ".join(cur_content)), out_path + "/tokenized_body.temp", False)
        write_file(word_tokenize(" ".join(cur_content)), out_path + "/lemmatized_body.temp", False)