Example #1
0
def replace_words_by_num(whole_words, file_dir, save_dir):
    # Rebuild the output directory from scratch.
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filename = []

    def name_filter(s):
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    content = {}
    for file in filename:
        lines = tools.read_lines(file)
        string = ""
        for line in lines:
            words = line.split(" ")
            for word in words:
                if len(word) > 0 and word in whole_words:
                    string += str(whole_words[word]) + " "
            string = string.strip()
            string += "\n"
        content[tools.get_name(file)] = string
        # print(string)
        # input()
    for name in content:
        savepath = save_dir + name + ".txt"
        tools.write(savepath, content[name])
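A minimal, self-contained sketch of the same word-to-id replacement, using a plain dict in place of the project's tools helpers (all names here are illustrative):

def replace_words_demo(word_index, lines):
    # word_index maps token -> integer id, e.g. {"a": 0, "b": 1}
    out_lines = []
    for line in lines:
        ids = [str(word_index[w]) for w in line.split(" ") if w in word_index]
        out_lines.append(" ".join(ids))
    return "\n".join(out_lines) + "\n"

# replace_words_demo({"a": 0, "b": 1}, ["a b", "b c"]) -> "0 1\n1\n"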
Example #2
0
def save_data(data, save_root):
    news_root = save_root + "/news/"
    abst_root = save_root + "/abstract/"
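    # data[i] is assumed to be an (abstract, news) pair.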
    for i in range(len(data)):
        fname = "trainning_" + str(i) + ".txt"

        ftools.write(abst_root + fname, data[i][0])
        ftools.write(news_root + fname, data[i][1])
Example #3
0
def get_result(dataname="cleandata_highquality_3500"):
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        # Skip plain files; each model directory holds its own eval_res.txt.
        if ".txt" in name:
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        # Pull the score list out of the brackets on the second line.
        content += name + ", " + lines[1][lines[1].index("[") +
                                          1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(Dir.res + "/result/" + dataname + "/result.txt", content)
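The bracket slice above just takes the text between the first "[" and "]" on the line; a tiny illustration with a made-up line:

line = "rouge scores: [0.41, 0.18, 0.37]"
print(line[line.index("[") + 1:line.index("]")])
# -> 0.41, 0.18, 0.37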
Example #4
0
def get_clue_words(path=Dir.res + "/extradata/",
                   savepath=Dir.res +
                   "/parameter/summarization_parameter/clue_words",
                   word_index=3):
    _, res_sen = filter(path)
    words = {}
    for var in res_sen:
        for sen in var[1:]:
            ws = tools.seperate(sen)
            # Only the first `word_index` tokens of each sentence count as clue-word candidates.
            for w in ws[:word_index]:
                if w not in words:
                    words[w] = 0
                words[w] += 1

    content = ""
    for w in words:
        content += w + "," + str(words[w]) + "\n"
    ftools.write(savepath + str(word_index), content)
Example #5
0
def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        reference = ftools.read_lines(ref_root + fname)
        # Whitespace-tokenise each line before scoring.
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in reference]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"

    ftools.write(detail_path, content)
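rouge_1_simple and rouge_2_simple are project helpers; as a rough sketch of what a simple ROUGE-1 recall computes (an assumption about their behaviour, not the project's exact implementation):

from collections import Counter

def rouge_1_recall(reference_tokens, candidate_tokens):
    # Unigram overlap divided by the number of reference unigrams.
    ref_counts = Counter(reference_tokens)
    cand_counts = Counter(candidate_tokens)
    overlap = sum(min(ref_counts[t], cand_counts[t]) for t in ref_counts)
    return overlap / max(len(reference_tokens), 1)

# rouge_1_recall("the cat sat".split(), "the cat ran".split()) -> 2/3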
Example #6
0
    def craw_urls(self):
        # Resume from page 372 (presumably earlier pages were already crawled).
        start = 372
        for i in range(start, self.page_nums):
            request = Request.Request(self.url + str(i))
            for key in self.params:
                request.add_header(key, self.params[key])
            response = Request.urlopen(request)

            html = response.read()
            html = html.decode('utf-8')
            infos = re.findall(self.url_regex, html)
            save_content = ""
            for info in infos:
                new_url = self.url_unqoate(info[-1])
                new_infor = [info[0], info[1], info[-1], new_url]
                save_content += self.seperator.join(new_infor) + "\n"

            tools.check_build_file(self.url_file)
            tools.write(self.url_file, content=save_content, mode="a")
            print(i, len(infos))
Example #7
0
def clean(data_dir=Dir.res + "/cleandata_8700/news/"):
    flist = tools.get_files(data_dir)
    # print(data_dir,len(flist))
    for fname in flist:
        flag = False
        content = tools.read(data_dir + fname)
        if "3805" in fname:
            print(content)
            input()

        if "您的浏览器不支持video标签\n" in content:
            content = content.replace("您的浏览器不支持video标签\n", "")
            flag = True
        if "新闻 专题 微博" in content:
            flag = True
            content = content[:content.index("新闻 专题 微博")]

        if flag:
            print(fname)
            tools.write(data_dir + fname, content)
Example #8
0
def build_word_index(file_dir, words_path):
    filename = []

    def name_filter(s):
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    whole_words = {}
    for file in filename:
        lines = tools.read_lines(file)
        for line in lines:
            words = list(jieba.cut(line))
            for word in words:
                if len(word) > 0 and word not in whole_words:
                    # Assign the next free integer id to each new token.
                    whole_words[word] = len(whole_words)
    word_index = ""
    for word in whole_words.keys():
        word_index += word + ":" + str(whole_words[word]) + "\n"
    tools.write(words_path, word_index)
    return whole_words
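The same indexing idea as a minimal stand-alone sketch (assumes jieba is installed; file handling omitted):

import jieba

def build_index_demo(lines):
    # Map every distinct token to the order in which it was first seen.
    index = {}
    for line in lines:
        for token in jieba.cut(line):
            if token and token not in index:
                index[token] = len(index)
    return index

# build_index_demo(["今天天气很好", "今天下雨"]) might give
# {"今天": 0, "天气": 1, "很": 2, "好": 3, "下雨": 4} (exact segmentation depends on jieba)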
Example #9
0
    def craw_url(self,page_index,save_path):
        url = "http://weibo.cn/breakingnews?page=" + str(page_index)
        header = ['Host', 'weibo.cn', 'User-Agent',
                  'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0', 'Accept',
                  'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language',
                  'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Cookie',
                  'SCF=Ah6oK9ne4mmUNoYw4kUuNRmslSDJZqMC8SFA5i4tUHBOxdAcSzsIBEEOZfx3fQNj0BgpLdQSDXoBtnymKFxl8KA.; SUHB=0z1B6sSFzJ07wI; _T_WM=7fe561e14961c07e54388eb18a1b0902; SUB=_2A2502RrGDeRhGedG6loS-SbLzzuIHXVUJaaOrDV6PUJbkdANLVmtkW0WkE6llUm_KXMeRq22wEZ0nvVBRQ..; SSOLoginState=1507682966',
                  'DNT', '1', 'Connection', 'keep-alive', 'Upgrade-Insecure-Requests', '1']
        request = Request.Request(url)
        params = {}
        # header is a flat [key, value, key, value, ...] list; walk it in pairs.
        for i in range(0, len(header) - 1, 2):
            request.add_header(header[i], header[i + 1])
            params[header[i]] = header[i + 1]
        response = Request.urlopen(request)
        html = response.read()
        html = html.decode('utf-8')
        regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
        infos = re.findall(regex, html)
        save_content = ""
        for info in infos:
            new_url = self.url_unqoate(info[-1])
            # Resolve one redirect via the HEAD response's Location header.
            reheader = requests.head(new_url).headers
            if "Location" in reheader:
                reurl = reheader["Location"]
            else:
                reurl = new_url
            # Skip picture and video posts.
            if "pic" in reurl or "vedio" in reurl:
                continue
            new_infor = [info[0], info[1], reurl]

            save_content += '\t'.join(new_infor) + "\n"

        tools.check_build_file(save_path)
        tools.write(save_path, content=save_content, mode="a")
        return len(infos)
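The redirect handling above reads the Location header of a HEAD response; a stand-alone sketch of just that step (function name is illustrative):

import requests

def resolve_redirect(url):
    # requests.head does not follow redirects by default, so Location (if present)
    # holds the target URL of the first redirect.
    resp = requests.head(url, timeout=10)
    return resp.headers.get("Location", url)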
Example #10
0
    def extract_info(self, string, index):
        regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
        infos = re.findall(regex, string)
        result = []
        un_crawe_info = []
        try_count = 3
        for info in infos:
            if len(info) == 3:
                all_url = info[-1]
                if "amp;" in all_url:
                    all_url = all_url.replace("amp;", "")
                # print(all_url)
                content = self.url(all_url)
                try_time = 0
                if content is not None:
                    content = content.replace("\n", "")
                # Retry a few times if the first fetch came back empty.
                if (content is None or len(content) == 0) and try_time < try_count:
                    while try_time < try_count:
                        time.sleep(0.1)
                        content = self.url(all_url)
                        # print("try",try_time)
                        if content is not None:
                            content = content.replace("\n", "")
                        if content is not None and len(content) > 0:
                            break
                        try_time += 1
                # print(try_time,all_url,info[0])
                if content is not None:
                    output_string = content.replace("\n", "")
                    # print("hahah",output_string.__len__())
                if content is not None and len(content) > 0:
                    tmp = []
                    for i in range(2):
                        tmp.append(info[i])

                    content_try, content_try_time = 3, 0
                    origin = self.get_origin_text(content)
                    origin = origin.replace("\n", "")
                    # print("result",origin)
                    # Treat very short extractions (< 100 chars) as failed crawls.
                    if len(origin) < 100:
                        # print("s-fail",origin)
                        if index not in self.fail_pages:
                            un_tmp = []
                            for info_un in tmp:
                                un_tmp.append(info_un)
                                # un_crawe_info.append(info_un)
                            un_tmp.append(info[-1])
                            un_crawe_info.append(un_tmp)
                        continue
                    tmp.append(origin)
                    result.append(tmp)
        string = ""
        count = 0

        if len(un_crawe_info) > 0 and index not in self.fail_pages:
            print("fail", len(un_crawe_info))
            fail_info = ""
            for tmp_fail in un_crawe_info:
                fail_info += str(tmp_fail) + "\n"
            tools.write(self.canot_crawe_info + "page" + str(index),
                        fail_info,
                        mode="w")
            self.fail_pages.add(index)

        for info in result:
            count += 1
            # print("text",count)
            # print(str(info))
            string += str(info) + "\n"
        if len(string) > 0:
            tools.write(self.craw_result + "page" + str(index), string)
        return count, len(infos)
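The regex above pulls (title, summary, link) triples out of the breaking-news HTML; a self-contained illustration on a made-up fragment:

import re

regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
html = '【标题】正文摘要 <a href="http://t.cn/abc123全文">link</a>'
print(re.findall(regex, html))
# -> [('标题', '正文摘要 ', 'http://t.cn/abc123')]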
Example #11
0
    def evaluator_rouge(self, model, result_dir, num):
        summarize_result = {}
        astart = time.time()
        ### Save the model's summarization results
        abstract = result_dir + "/abstract/"
        keys = sorted(self.data.keys())
        if self.parall:
            # Split the documents into roughly equal chunks, one per worker process.
            p = multiprocessing.Pool(self.cpu)
            inter = int(len(keys) / self.cpu) + 1
            args = []
            for i in range(self.cpu):
                tmp = {}
                if i == 0:
                    key = keys[:inter]
                    # print(i,"0",inter,len(key))
                elif i == self.cpu - 1:
                    key = keys[i * inter:]
                    # print(i, i * inter, "end", len(key))
                else:
                    key = keys[i * inter:(i + 1) * inter]
                    # print(i, i * inter, (i + 1) * inter, len(key))
                for k in key:
                    tmp[k] = self.data[k]
                args.append({"n": "work" + str(i),
                             "d": tmp,
                             "m": model,
                             "a": abstract
                             })
            # input()
            rslt = p.map(workers, args)
            # for var in rslt:
            #     for k in var.keys():
            #         summarize_result[k] = var[k]
            ### Post-process the abstracts (replace words with numeric ids)
            # print("saving abstract ",len(summarize_result))
            # for fname in summarize_result.keys():
                # tools.write_list(abstract + fname + ".txt", summarize_result[fname])
            abstract_processed = result_dir + "/abstract_processed/"
            abstract_seperate = result_dir + "/abstract_seperate/"
            RP.result_process(abstract, abstract_seperate)
            print("abstract separate done")
            RP.replace_words_by_num(self.word_index, abstract_seperate, abstract_processed)
            print("abstract replace done")
            # print(abstract_processed,result_dir)
            self.rouge_detail(abstract_processed, result_dir)

            ### Compute ROUGE
            # import src.evaluation.ROUGE
            # self.rouge = src.evaluation.ROUGE.ROUGE()
            # print("evaling")
            result = self.rouge.eval(abstract_processed, self.ref_processed, num)
            eval_result = result_dir + "/eval_res.txt"
            print(result)
            tools.write(eval_result, model.info + "\n" + result, mode="a")
            aend = time.time()
            print(aend - astart)

        else:
            count = 0
            for text in keys:
                if text not in summarize_result:
                    start = time.time()
                    count += 1
                    # if count <1530:
                    #     count+=1
                    #     continue
                    # print(text)
                    summarize_result[text] = model.summarize(self.data[text], num, fname=text)

                    end = time.time()
                    # tools.print_proccess(count, len(self.data.keys()))
                    print(text, count, "/", len(keys), end - start)
                    # print(result_save_dir_abstract + text + ".txt")
                    # print(  model.summarize(self.data[text], num) )
                    tools.write_list(abstract + text + ".txt", summarize_result[text])
            ### Post-process the abstracts (replace words with numeric ids)
            abstract_processed = result_dir + "/abstract_processed/"
            abstract_seperate = result_dir + "/abstract_seperate/"
            RP.result_process(abstract, abstract_seperate)
            RP.replace_words_by_num(self.word_index, abstract_seperate, abstract_processed)
            # print(abstract_processed,result_dir)
            self.rouge_detail(abstract_processed, result_dir)

            ### Compute ROUGE
            # import src.evaluation.ROUGE
            # self.rouge = src.evaluation.ROUGE.ROUGE()
            # print("evaling")
            result = self.rouge.eval(abstract_processed, self.ref_processed)
            eval_result = result_dir + "/eval_res.txt"
            print(result)
            tools.write(eval_result, model.info + "\n" + result, mode="a")
            aend = time.time()
            print(aend - astart)
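The parallel branch splits self.data into roughly equal chunks before handing them to multiprocessing.Pool; a minimal stand-alone sketch of that chunking pattern (the worker here is a toy stand-in for the real summarizer):

import multiprocessing

def worker(chunk):
    # Toy worker: just report the length of each document.
    return {k: len(v) for k, v in chunk.items()}

def split_dict(data, n_workers):
    keys = sorted(data.keys())
    size = len(keys) // n_workers + 1
    return [{k: data[k] for k in keys[i * size:(i + 1) * size]}
            for i in range(n_workers)]

if __name__ == "__main__":
    data = {"doc%d" % i: "x" * i for i in range(10)}
    with multiprocessing.Pool(2) as pool:
        results = pool.map(worker, split_dict(data, 2))
    print(results)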
Example #12
0
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/",
                     save_dir=Dir.res + "/cleandata_none"):
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()

            # try:
            if 1:  # stands in for the commented-out try/except below
                # Split each line on its first and last comma (roughly: title, abstract, news).
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])

                abstract_index = get_abstract_index(news, abstracts)

                count += 1
                # Keep only samples where every abstract sentence was aligned to the news.
                if len(abstract_index) != len(abstracts):
                    continue
                # print(abstract_index)
                # cmd = input()
                # if "1" in cmd:
                #     print('\n'.join(abstracts))
                #     print("--------------------")
                #     print('\n'.join(news))
                #
                #     print("--------------------")
                #     print("words:",w_count)
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                # Drop articles shorter than 520 tokens.
                if w_count < 520:
                    continue

                if sum(abstract_index[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(
                    save_dir + "/abstract/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(abstracts))
                tools.write(
                    save_dir + "/news/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(news))
            # except Exception as e:
            #     print(str(e),e.with_traceback(e.__traceback__))
            #     print("error",line)
            #     bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))
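Splitting a line on its first and last comma yields three pieces even when the middle piece itself contains commas; a tiny illustration of that slicing:

line = "title,abstract one, abstract two,news body"
first_, last_ = line.index(","), line.rindex(",")
print([line[:first_], line[first_ + 1:last_], line[last_ + 1:]])
# -> ['title', 'abstract one, abstract two', 'news body']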