Example #1
def craw_result_process(root=Dir.res + "/data/"):
    # Collect (abstract, news) pairs from the crawled CSV-like files,
    # keeping only samples that pass the data_filter quality check.
    files = ftools.get_files(root)
    data = []
    for filename in files:
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            judge = data_filter(news, abstract)
            if judge > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data
Example #2
 def __init__(self):
     self.auto = AutoCoder()
     self.name = "fast encoder"
     self.data = {}
     # Preload every cached encoder object, keyed by file name.
     path = Dir.res + "/encoder/cleandata_8700/"
     for name in ftools.get_files(path):
         self.data[name] = tools.load_object(path + name)
Example #3
 def load(self):
     path = Dir.res + "/cleandata_highquality_1640/abstract/"
     for name in ftools.get_files(path):
         tmp = ftools.read_lines(path + name)
         self.answer[name] = []
         for var in tmp:
             # Skip lines too short to be a meaningful reference sentence.
             if len(var.strip()) <= 5:
                 continue
             self.answer[name].append(var)
Example #4
 def rouge_detail(self, abstract_processed, save_dir):
     # Compute per-file ROUGE-1/ROUGE-2 scores and save them as CSV lines.
     flist = tools.get_files(abstract_processed)
     save_content = []
     for fname in flist:
         content = tools.read_lines(abstract_processed + fname)
         reference = tools.read_lines(self.ref_processed + fname)
         lines = [line.split(" ") for line in content]
         refen = [line.split(" ") for line in reference]
         rouge1 = self.rouge_1_simple(refen, lines)
         rouge2 = self.rouge_2_simple(refen, lines)
         save_content.append(fname + "," + str(rouge1) + "," + str(rouge2))
     tools.write_list(save_dir + "/detials.txt", save_content)
Example #5
def analyze(main_name, compare_index, name="cleandata_small"):
    save_path = Dir.res + "/result/judge.txt"
    judge_dict = tools.load_object(save_path)
    print(len(judge_dict))

    # Per-file ROUGE details for the EntryBigraph baseline, the model under test and TextRank.
    entry_path = Dir.res + "/result/" + name + "/EntryBigraph/detials.txt"
    entry_data = load_data(entry_path)
    first_path = Dir.res + "/result/" + name + "/" + main_name + "/detials.txt"
    first_data = load_data(first_path)
    textrank_path = Dir.res + "/result/" + name + "/TextRank/detials.txt"
    tr_data = load_data(textrank_path)

    # For every file collect the ROUGE-1/ROUGE-2 differences between the models.
    result = {}
    for key in first_data.keys():
        a = first_data[key][0] - entry_data[key][0]
        b = first_data[key][1] - entry_data[key][1]
        c = first_data[key][0] - tr_data[key][0]
        d = first_data[key][1] - tr_data[key][1]
        e = first_data[key][0] - tr_data[key][0] + entry_data[key][0] - tr_data[key][0]
        f = first_data[key][1] - tr_data[key][1] + entry_data[key][1] - tr_data[key][1]
        result[key] = [a, b, c, d, e, f]

    # Keep only the files whose precomputed quality judgement exceeds 0.5.
    count = 0
    news_root = Dir.res + "/" + name + "/news/"
    fname = ftools.get_files(news_root)
    new_result = {}
    for filename in fname:
        judge = judge_dict[filename]
        if judge > 0.5:
            new_result[filename] = result[filename]
            new_result[filename].append(judge)
            count += 1
    tools.save_object(judge_dict, Dir.res + "/result/judge.txt")

    # Sort the surviving files by the requested difference column, best first.
    tmp = dict(
        sorted(new_result.items(),
               key=lambda d: d[1][compare_index],
               reverse=True))
    save_dict = {}
    names = []
    for key in tmp.keys():
        save_dict[key] = tmp[key]
        names.append(key)
    save_path = Dir.res + "/result/" + name + "/" + main_name + ".txt"
    ftools.write_com_dict(save_path, save_dict)
    return names
Example #6
def get_result(dataname="cleandata_highquality_3500"):
    # Collect the evaluation scores of every model directory into one summary file.
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        # Skip plain files (e.g. earlier summaries); only model directories hold eval_res.txt.
        if ".txt" in name:
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        # The second line holds the scores as "[...]"; keep only what is inside the brackets.
        content += name + ", " + lines[1][lines[1].index("[") + 1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(root + "result.txt", content)
Example #7
 def load_data(self, path=Dir.res + "/cleandata_604/news/"):
     # Build one gensim TaggedDocument per sentence; sen_dict maps the joined
     # sentence text back to its tag so its vector can be looked up later.
     flist = ftools.get_files(path)
     data = []
     count = 0
     for name in flist:
         filepath = path + name
         lines = ftools.read_lines(filepath)
         for line in lines:
             words = tools.seperate(line)
             data.append(TaggedDocument(words, ["sen_" + str(count)]))
             self.sen_dict[''.join(words)] = "sen_" + str(count)
             count += 1
     return data
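The TaggedDocument list built above is the input format gensim's Doc2Vec expects. A minimal training sketch under that assumption (the helper name and model parameters are illustrative, not the project's actual code):

from gensim.models.doc2vec import Doc2Vec

def train_sentence_vectors_sketch(tagged_docs, savepath):
    # Train a small Doc2Vec model over the tagged sentences and persist it.
    model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=20)
    model.build_vocab(tagged_docs)
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(savepath)
    return model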
Example #8
    def indexlize_data(self, reprocess):
        ### Build the mapping from words to numeric indices.
        print("start")
        word_index_path = self.dir_path + "/words_index.txt"
        # Reprocess from scratch if any cached artifact is missing or empty.
        if not tools.isexists(word_index_path) or \
                not tools.isexists(self.ref_processed) or \
                not tools.isexists(self.ref_seperate) or \
                len(tools.get_files(self.ref_seperate)) == 0 or \
                len(tools.get_files(self.ref_processed)) == 0:
            reprocess = True
        if reprocess:
            self.word_index = RP.build_word_index(self.file, word_index_path)
            ### Convert the reference abstracts to their numeric form.
            print(self.file_ref, self.ref_seperate, self.ref_processed)
            RP.result_process(self.file_ref, self.ref_seperate)
            RP.replace_words_by_num(self.word_index, self.ref_seperate, self.ref_processed)
            print("references process done")
        else:
            self.load_word_index(word_index_path)
            print("word index loaded")
Example #9
def generate_new_data():
    npath = Dir.res + "/cleandata_highquality_3500/news/"
    new_npath = Dir.res + "/cleandata_highquality_3500_new/news/"
    new_apath = Dir.res + "/cleandata_highquality_3500_new/abstract/"

    uper = Uper()

    # Re-summarize every news file and store the copy together with its new 3-sentence abstract.
    for name in ftools.get_files(npath):
        path = npath + name
        content = ftools.read_lines(path)
        new_abstract = uper.summarize(content, num=3, fname=name[:-4])
        ftools.copy(path, new_npath + name)
        ftools.write_list(new_apath + name, new_abstract)
Example #10
def get_small_data():
    root = Dir.res + "/cleandata_8700/"
    saveroot = Dir.res + "/cleandata_small/"

    flist = ftools.get_files(root + "news/")
    count = 0
    # Copy only the samples whose news text has fewer than 80 lines into the small dataset.
    for i, name in enumerate(flist):
        content = ftools.read_lines(root + "news/" + name)
        if len(content) < 80:
            print(count, i, len(flist))
            ftools.copy(root + "news/" + name, saveroot + "news/" + name)
            ftools.copy(root + "abstract/" + name,
                        saveroot + "abstract/" + name)
            count += 1
Example #11
 def load_data(self, path=Dir.res + "/cleandata_604/news/"):
     # Build one gensim TaggedDocument per file; sen_dict maps the whole
     # concatenated text back to its tag.
     flist = ftools.get_files(path)
     data = []
     count = 0
     for name in flist:
         filepath = path + name
         lines = ftools.read_lines(filepath)
         essay = ""
         tmp = []
         for line in lines:
             words = tools.seperate(line)
             tmp.extend(words)
             essay += ''.join(words)
         data.append(TaggedDocument(tmp, ["text_" + str(count)]))
         self.sen_dict[essay] = "text_" + str(count)
         count += 1
     return data
Example #12
def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    # Recompute per-file ROUGE-1/ROUGE-2 scores for a model and rewrite its details file.
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i, fname in enumerate(filelist):
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        reference = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in reference]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"

    ftools.write(detail_path, content)
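rouge_1_simple and rouge_2_simple are project helpers whose code is not shown here. As an illustration only, a ROUGE-1 recall score over pre-tokenized reference and candidate sentences is commonly computed roughly like this (a sketch under that assumption, not the project's implementation):

from collections import Counter

def rouge_1_recall_sketch(reference_sents, candidate_sents):
    # Clipped unigram overlap between candidate and reference, divided by the reference length.
    ref_counts = Counter(w for sent in reference_sents for w in sent)
    cand_counts = Counter(w for sent in candidate_sents for w in sent)
    overlap = sum(min(ref_counts[w], cand_counts[w]) for w in ref_counts)
    total = sum(ref_counts.values())
    return overlap / total if total else 0.0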
Example #13
def clean(data_dir=Dir.res + "/cleandata_8700/news/"):
    flist = tools.get_files(data_dir)
    for fname in flist:
        flag = False
        content = tools.read(data_dir + fname)

        # Drop the "your browser does not support the video tag" placeholder left by the crawler.
        if "您的浏览器不支持video标签\n" in content:
            content = content.replace("您的浏览器不支持video标签\n", "")
            flag = True
        # Cut everything from the "news / topics / weibo" navigation footer onwards.
        if "新闻 专题 微博" in content:
            flag = True
            content = content[:content.index("新闻 专题 微博")]

        if flag:
            print(fname)
            tools.write(data_dir + fname, content)
Example #14
def fill_all(path=Dir.res + "/craw_data/original/",
             save_path=Dir.res + "/craw_data/data/",
             fail_save_path=Dir.res + "/craw_data/fail/"):
    # Fetch the full article for every crawled entry; successes and failures are written separately.
    crawer = Crawer()
    files = tools.get_files(path)
    for name in files:
        content = tools.read_lines(path + name)
        fail_content = ""
        save_content = ""
        # Truncate both output files before appending results for this input file.
        crawer.writeIntofile(save_path + name, "")
        crawer.writeIntofile(fail_save_path + name, "")
        succ_count, fail_count = 0, 0
        for line in content:
            tmp = line.split(",")
            article = crawer.get_article(tmp[-1]).strip().replace("\n", "")
            if len(article) > 0:
                save_content += tmp[0] + "," + tmp[1] + "," + article + '\n'
                succ_count += 1
            else:
                fail_content += tmp[0] + "," + tmp[1] + "," + tmp[2] + '\n'
                fail_count += 1
        crawer.writeIntofile(save_path + name, save_content)
        crawer.writeIntofile(fail_save_path + name, fail_content)
        print(name, succ_count, fail_count)
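Crawer.get_article is project code not shown here. Purely as an illustration of the kind of fetch it performs, a minimal stand-in using requests (the function name is hypothetical, and the HTML stripping is deliberately naive):

import re
import requests

def get_article_sketch(url, timeout=10):
    # Fetch the page and strip tags/whitespace; return "" on any failure,
    # mirroring how fill_all treats empty articles as failures.
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        text = re.sub(r"<[^>]+>", " ", resp.text)
        return re.sub(r"\s+", " ", text).strip()
    except requests.RequestException:
        return ""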
Example #15
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences


def train(traindata, savepath=Dir.res + "/parameter/words_vector/w2v.model"):
    ftools.check_filename(savepath)
    # gensim < 4.0: the vector dimensionality argument is `size` (renamed to `vector_size` in 4.x).
    model = Word2Vec(sentences=traindata,
                     size=200,
                     window=5,
                     min_count=3,
                     workers=4)
    model.save(savepath)


def load(path=Dir.res + "/parameter/words_vector/w2v.model"):
    model = Word2Vec.load(path)
    return model


if __name__ == "__main__":
    root = Dir.res + "/data/"
    flist = ftools.get_files(root)
    data = []
    count = 0
    for name in flist:
        path = root + name
        print(" %04d" % count, len(flist))
        count += 1
        data.extend(loaddata(path))
    train(data)
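A quick way to sanity-check the trained vectors, assuming the same gensim 3.x version implied by the `size` argument above (the query token is only an example):

from gensim.models import Word2Vec

model = Word2Vec.load(Dir.res + "/parameter/words_vector/w2v.model")
# Nearest neighbours of an example token from the training corpus.
print(model.wv.most_similar("新闻", topn=5))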
Example #16
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/",
                     save_dir=Dir.res + "/cleandata_none"):
    # Start from an empty output directory.
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()
            try:
                # Each line is "id,abstract,news"; the news part may itself contain commas,
                # so split only on the first and the last comma.
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])

                # Every abstract sentence must be traceable to a news sentence.
                tmp = get_abstract_index(news, abstracts)
                count += 1
                if len(tmp) != len(abstracts):
                    continue

                # Drop short articles (fewer than 520 words in total).
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:
                    continue

                # tmp holds the matched news-sentence index for each abstract sentence;
                # skip samples that only mirror the very beginning of the article.
                if sum(tmp[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(
                    save_dir + "/abstract/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(abstracts))
                tools.write(
                    save_dir + "/news/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(news))
            except Exception as e:
                print("error", str(e), line)
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))