Example #1
def craw_result_process(root=Dir.res + "/data/"):
    # Collect (abstract, news) pairs from the crawled files under `root`,
    # keeping only pairs whose data_filter score is above 0.5.
    files = ftools.get_files(root)
    data = []
    for filename in files:
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            print(abstract)
            print(news)
            jude = data_filter(news, abstract)
            if jude > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data
Example #2
    def wiki_preprocess(self,
                        save_path=Dir.res +
                        "/WikiCorpus/wiki.jian.seperate.txt"):
        # Segment the wiki corpus with jieba (dropping sentence-ending punctuation)
        # and append it to save_path in batches of 5000 lines.
        tmp_result = []
        with open(self.train_file, "r") as train_corpus:
            for line in train_corpus:
                # Split on sentence-ending punctuation / ellipses, then segment each piece.
                regex = r"。。。。。。|\?|。|!|;|\.\.\.\.\.\."
                sentences = re.split(regex, line)
                words = []
                for sen in sentences:
                    words.extend(jieba.cut(sen.strip()))
                new_line = ' '.join(words)
                tmp_result.append(new_line)
                if len(tmp_result) >= 5000:
                    tools.write_list(save_path, tmp_result, mode="a")
                    tmp_result = []
            # Write the final partial batch.
            tools.write_list(save_path, tmp_result, mode="a")
Example #3
 def save_value(self, path, key, coverage_list, relative_matrix, clues_list,
                entities_list):
     ftools.check_filename(path)
     save_dict = {}
     save_dict[key] = [
         coverage_list, relative_matrix, clues_list, entities_list
     ]
     tools.save_object(save_dict, path)
Example #4
 def save_value(self, path, text, coverage_list, relative_matrix,
                clues_list, entities_list):
     ftools.check_filename(path)
     save_dict = {}
     save_dict['#$#'.join(text)] = [
         coverage_list, relative_matrix, clues_list, entities_list
     ]
     tools.save_object(save_dict, path)
Example #5
def save_data(data, save_root):
    news_root = save_root + "/news/"
    abst_root = save_root + "/abstract/"
    for i in range(len(data)):
        fname = "trainning_" + str(i) + ".txt"

        ftools.write(abst_root + fname, data[i][0])
        ftools.write(news_root + fname, data[i][1])
Example #6
def train(traindata, savepath=Dir.res + "/parameter/words_vector/w2v.model"):
    ftools.check_filename(savepath)
    model = Word2Vec(sentences=traindata,
                     size=200,
                     window=5,
                     min_count=3,
                     workers=4)
    model.save(savepath)
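
Example #6 uses the pre-4.0 gensim API, where Word2Vec takes size=; gensim 4.x renamed this parameter to vector_size. A minimal usage sketch under that assumption (the corpus path and the driver code below are illustrative, not part of the project):

from gensim.models import Word2Vec

# Hypothetical driver for Example #6 (assumes gensim < 4.0).
# traindata must be an iterable of token lists; with min_count=3 a word
# needs at least 3 occurrences in the corpus to receive a vector.
with open("corpus.txt", encoding="utf-8") as f:   # hypothetical corpus of space-separated tokens
    traindata = [line.split() for line in f]
train(traindata, savepath="w2v.model")
model = Word2Vec.load("w2v.model")                # reload the saved model
vector = model.wv["新闻"]                          # 200-dimensional vector, if the word survived min_count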
Example #7
 def load(self):
     path = Dir.res + "/cleandata_highquality_1640/abstract/"
     for name in ftools.get_files(path):
         tmp = ftools.read_lines(path + name)
         self.answer[name] = []
         for var in tmp:
             if len(var.strip()) <= 5:
                 continue
             self.answer[name].append(var)
Example #8
def build_w2v_train_data():
    file_dir = Dir.res + "data/news.sentences/"
    save_path = Dir.res + "data/all.txt"
    filelist = []
    content = []
    tools.get_filelist(file_dir, filelist)
    for file in filelist:
        sentences = tools.read_lines(file)
        content.extend(sentences)
    tools.write_list(save_path, content)
Example #9
def filter(path=Dir.res + "/extradata/"):
    # print(os.path.abspath(path))
    news_path = path + "news/"
    abstract_path = path + "abstract/"

    news_file_list = os.listdir(news_path)
    abst_file_list = os.listdir(abstract_path)

    bad_sample = []
    news = []
    for name in news_file_list:
        # if name in bad_sample:
        #     continue
        news.append(ftools.read_lines(news_path + name))
    abstracts = []
    for name in abst_file_list:
        # if name in bad_sample:
        #     continue
        abstracts.append(ftools.read_lines(abstract_path + name))

    res = []
    res_sen = []
    for i in range(len(news)):
        # print(news_file_list[i], abst_file_list[i], True if news_file_list[i] == abst_file_list[i] else False)
        matrix = [[0 for var in range(len(news[i]))]
                  for var in range(len(abstracts[i]))]
        tmp = []
        tmp_sen = []
        try:
            for k in range(len(abstracts[i])):
                # print(abstracts[i][k])
                for j in range(len(news[i])):
                    matrix[k][j] = len(
                        crpss.longest_common_subsequence(
                            news[i][j], abstracts[i][k]))
                # print(matrix[k].index(max(matrix[k])),news[i][matrix[k].index(max(matrix[k]))])
                max_index = matrix[k].index(max(matrix[k]))
                tmp.append(max_index)
                tmp_sen.append(news[i][max_index])
            # print(len(tmp),True if len(tmp) == len(abstracts[i]) else False)
        except Exception:
            # LCS matching failed for this document; remember it as a bad sample.
            bad_sample.append(news_file_list[i])

        res.append([news_file_list[i]] + tmp)
        res_sen.append([news_file_list[i]] + tmp_sen)
    # for bb in bad_sample:
    #     print(bb)

    #     res.append(tmp)
    # print(bad_sample)
    # for i in range(len(res)):
    #     tmp = res[i]
    #     print(news_file_list[i],tmp,len(news[i]),len(abstracts[i]) , True if len(abstracts[i] ) == len(tmp) else False)
    return res, res_sen
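
Example #9 aligns every abstract sentence with the news sentence sharing the longest common subsequence, through crpss.longest_common_subsequence, which is not shown here. A minimal character-level sketch of such a helper, assuming it returns the common subsequence itself so that len() gives its length, as used above:

def longest_common_subsequence(a, b):
    # Hypothetical stand-in for crpss.longest_common_subsequence:
    # classic dynamic-programming LCS over two character strings.
    m, n = len(a), len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            if a[i] == b[j]:
                dp[i + 1][j + 1] = dp[i][j] + 1
            else:
                dp[i + 1][j + 1] = max(dp[i][j + 1], dp[i + 1][j])
    # Walk back through the table to recover one common subsequence.
    i, j, chars = m, n, []
    while i > 0 and j > 0:
        if a[i - 1] == b[j - 1]:
            chars.append(a[i - 1])
            i, j = i - 1, j - 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return ''.join(reversed(chars))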
Example #10
 def rouge_detail(self,abstract_processed,save_dir):
     flist = tools.get_files(abstract_processed)
     save_content = []
     for fname in flist:
         content = tools.read_lines(abstract_processed+fname)
         refence = tools.read_lines(self.ref_processed+fname)
         lines =[line.split(" ") for line in content]
         refen =[line.split(" ") for line in refence]
         rouge1 = self.rouge_1_simple(refen,lines)
         rouge2 = self.rouge_2_simple(refen, lines)
         save_content.append(fname+","+str(rouge1)+","+str(rouge2))
     tools.write_list(save_dir+"/detials.txt",save_content)
Example #11
def get_result(dataname="cleandata_highquality_3500"):
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        if ".txt" in name:
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        content += name + ", " + lines[1][lines[1].index("[") +
                                          1:lines[1].index("]")] + "\n"
    print(content)
    ftools.write(Dir.res + "/result/" + dataname + "/result.txt", content)
Example #12
def analyze(main_name, compare_index, name="cleandata_small"):
    save_path = Dir.res + "/result/judge.txt"
    jude_dict = tools.load_object(save_path)
    # print(list(jude_dict.keys())[0])

    print(len(jude_dict))

    entry_path = Dir.res + "/result/" + name + "/EntryBigraph/detials.txt"
    entry_data = load_data(entry_path)
    first_path = Dir.res + "/result/" + name + "/" + main_name + "/detials.txt"
    first_data = load_data(first_path)
    textrank_path = Dir.res + "/result/" + name + "/TextRank/detials.txt"
    tr_data = load_data(textrank_path)
    result = {}
    for key in first_data.keys():
        a = first_data[key][0] - entry_data[key][0]
        b = first_data[key][1] - entry_data[key][1]
        c = first_data[key][0] - tr_data[key][0]
        d = first_data[key][1] - tr_data[key][1]
        e = first_data[key][0] - tr_data[key][0] + entry_data[key][
            0] - tr_data[key][0]
        f = first_data[key][1] - tr_data[key][1] + entry_data[key][
            1] - tr_data[key][1]
        result[key] = [a, b, c, d, e, f]
    count = 0
    news_root = Dir.res + "/" + name + "/news/"
    abst_root = Dir.res + "/" + name + "/abstract/"
    fname = ftools.get_files(news_root)
    new_result = {}
    for filename in fname:
        # print(filename,count,len(fname))
        # news = ftools.read_lines(news_root+filename)
        # weibo = ftools.read_lines(abst_root+filename)
        # jude = data_filter(news,weibo)
        # jude_dict[filename] = jude
        jude = jude_dict[filename]
        if jude > 0.5:
            new_result[filename] = result[filename]
            new_result[filename].append(jude)
            count += 1
    tools.save_object(jude_dict, Dir.res + "/result/judge.txt")
    tmp = dict(
        sorted(new_result.items(),
               key=lambda d: d[1][compare_index],
               reverse=True))
    save_dict = {}
    names = []
    for key in tmp.keys():
        save_dict[key] = tmp[key]
        names.append(key)
    save_path = Dir.res + "/result/" + name + "/" + main_name + ".txt"
    ftools.write_com_dict(save_path, save_dict)
    return names
Example #13
def get_dirfiles_into_list_luhn(file_dir, replace_dir):
    # Map each file (keyed by its name with the first 8 characters stripped)
    # either to its original path or to a rewritten path under replace_dir.
    file_list, result = [], {}
    tools.get_filelist(file_dir, file_list)
    for listfile in file_list:
        filename = tools.get_name(listfile)
        filename = filename[8:]
        if filename not in result.keys():
            result[filename] = []
        if replace_dir == "":
            result[filename] = listfile
        else:
            result[filename].append(replace_dir + "/" + tools.get_name(listfile) + ".txt")
    return result
Example #14
def transfer(cleandata_root = Dir.res+"/cleandata_1189/news/",save_path = Dir.res+"/sen_data/1189_corpus.txt"):
    filelist = os.listdir(cleandata_root)
    lines = []
    for name in filelist:
        filepath = cleandata_root+name
        for line in ftools.read_lines(filepath):
            words = tools.seperate(line)
            for i in range(len(words)):
                if words[i].isdigit():
                    words[i] = "num"
            lines.append(' '.join(words))

    ftools.write_list(save_path,lines)
Example #15
 def load_data(self, path=Dir.res + "/cleandata_604/news/"):
     flist = ftools.get_files(path)
     data = []
     count = 0
     for name in flist:
         filepath = path + name
         lines = ftools.read_lines(filepath)
         for line in lines:
             words = tools.seperate(line)
             data.append(TaggedDocument(words, ["sen_" + str(count)]))
             self.sen_dict[''.join(words)] = "sen_" + str(count)
             count += 1
     return data
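
Example #15 wraps each segmented sentence in a gensim TaggedDocument tagged sen_<n>. A minimal sketch of feeding the returned list to Doc2Vec; loader stands in for an instance of the class above, and the parameter names (vector_size, epochs) follow gensim 3.x/4.x:

from gensim.models.doc2vec import Doc2Vec

# Hypothetical usage (not part of the original snippet): train a sentence-level
# Doc2Vec model on the TaggedDocument list returned by load_data().
documents = loader.load_data()   # `loader` is an instance of the class above (assumption)
model = Doc2Vec(documents, vector_size=200, window=5, min_count=2, epochs=20, workers=4)
vector = model.infer_vector(documents[0].words)   # embed one segmented sentence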
Example #16
def rouge_detail():
    fname = "trainning_3570.txt"
    content = ftools.read_lines(
        Dir.res +
        "/result/cleandata_small/Second Version/abstract_processed/" + fname)
    refence = ftools.read_lines(Dir.res + "/cleandata_small/ref_processed/" +
                                fname)
    lines = [line.split(" ") for line in content]
    refen = [line.split(" ") for line in refence]
    # print(lines)
    # print(refen)
    rouge1 = rouge_1_simple(refen, lines)
    rouge2 = rouge_2_simple(refen, lines)
    print(rouge1, rouge2)
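
Examples #10, #16 and #28 all call rouge_1_simple / rouge_2_simple, whose implementations are not included. A minimal, hypothetical stand-in for a recall-oriented ROUGE-1 with the same call signature (two lists of word lists); rouge_2_simple would do the same over adjacent-word bigrams:

from collections import Counter

def rouge_1_simple(refen, lines):
    # Recall-oriented unigram overlap: how many reference unigrams
    # (counted with multiplicity) also appear in the candidate summary.
    ref_counts = Counter(w for sent in refen for w in sent)
    cand_counts = Counter(w for sent in lines for w in sent)
    overlap = sum(min(ref_counts[w], cand_counts[w]) for w in ref_counts)
    total = sum(ref_counts.values())
    return overlap / total if total else 0.0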
Example #17
 def load_word_index(self,path):
     lines = tools.read_lines(path)
     for line in lines:
         index = line.rindex(":")
         # print(line[:index])
         # print(line[index+1:])
         self.word_index[line[:index]] = int(line[index+1:])
Example #18
def vectorize_files(fileroot,savepath):
    data = ftools.read_dir_lines_dict(fileroot)
    auto  = AutoCoder()
    count = 0
    print(len(data.keys()))
    for key in data.keys():

        text = '。'.join(data[key])

        sens, sens_words, sens_tags = auto.preprocess(text)
        start = time.time()
        sens_vector,essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()
        key_text =''.join([''.join(var) for var in sens_words])

        save_key = tools.md5(key_text)
        tmp =[list(var) for var in sens_vector]

        save_object = [tmp,list(essay_vector)]

        tools.save_object(save_object,Dir.res + "/encoder/cleandata_8700/"+save_key)

        count+=1

        print(count,len(data.keys()),end-start)
Example #19
    def train(self,
              dimension=200,
              iter=10,
              trainfile=Dir.res + "WikiCorpus/wiki.jian.seperate.txt",
              load_model_if_exits=True):
        model_path = Dir.res + "/W2V/w2v_" + str(dimension) + ".model"
        if os.path.exists(model_path) and load_model_if_exits:
            self.model = Word2Vec.load(model_path)
            return self.model
        tmp = tools.read_lines(trainfile)
        index = 0
        for string in tmp:
            words = (string.split(" "))
            self.corpus.append(words)
            # print(words)
            # index+=1
            # print(index)
            # Doc2Vec()
        self.model = Word2Vec(self.corpus,
                              size=dimension,
                              iter=iter,
                              min_count=5)
        # Save under the same path that the load-if-exists check above uses.
        if not os.path.lexists(Dir.res + "/W2V/"):
            os.makedirs(Dir.res + "/W2V/")
        self.model.save(model_path)
        return self.model
Example #20
def load_data(path):
    lines = ftools.read_lines(path)
    data = {}
    for line in lines:
        tmp = line.split(",")
        data[tmp[0]] = [float(tmp[1]), float(tmp[2])]
    return data
Example #21
def clean(path=Dir.res + "/extradata/", save=Dir.res + "/cleandata_1073/"):
    # filter() (see Example #9) returns (res, res_sen); only the index lists are used here.
    res, _ = filter(path)
    clean_data = []
    for tmp in res:
        # print(tmp)
        if 0 not in tmp or 1 not in tmp:
            clean_data.append(tmp)
    for cd in clean_data:
        if cd[0] == "training_288.txt":
            print("skip------------------")
            continue
        print(cd[0])
        news_path = save + "news/" + cd[0]
        abstract_path = save + "abstract/" + cd[0]
        ftools.copy(path + "news/" + cd[0], news_path)
        ftools.copy(path + "abstract/" + cd[0], abstract_path)
Example #22
 def __init__(self):
     self.auto = AutoCoder()
     self.name = "fast encoder"
     self.data ={}
     path = Dir.res+"/encoder/cleandata_8700/"
     fllist = ftools.get_files(path)
     for name in fllist:
         self.data[name] = tools.load_object(path+name)
Example #23
def generate_data(file=Dir.res +
                  "/extract_data_process/data_processed_9.9.txt",
                  savePath=Dir.res + "/extract_data_process/data"):
    content = tools.read_lines(file)[1:-1]
    data = {}
    for file in content:
        file = file.replace("&nbsp;", "")
        tmp = str(file[1:-1]).split("', '")
        if tmp[1] not in data.keys():
            data[tmp[1]] = tmp[2]
    index = 0
    for key in sorted(data.keys()):
        save_content = savePath + "/news/training_" + str(index) + ".txt"
        save_abstract = savePath + "/abstract/training_" + str(index) + ".txt"
        tools.write_list(save_content, seperate_sentences(data[key]))
        tools.write_list(save_abstract, seperate_sentences(key))
        index += 1
Example #24
def demo():
    summarizor = Summarizor_luhn()
    essay = tools.read(Dir.resource + "\\extradata\\luhn\\training20.txt")
    # print(essay)
    result = summarizor.summarize(essay=essay)
    print("========================")
    for line in result:
        print(line)
Example #25
 def load_data(self, path=Dir.res + "/cleandata_604/news/"):
     flist = ftools.get_files(path)
     data = []
     count = 0
     for name in flist:
         filepath = path + name
         lines = ftools.read_lines(filepath)
         essay = ""
         tmp = []
         for line in lines:
             words = tools.seperate(line)
             tmp.extend(words)
             essay += ''.join(words)
         data.append(TaggedDocument(tmp, ["text_" + str(count)]))
         self.sen_dict[essay] = "text_" + str(count)
         count += 1
     return data
Example #26
def get_clue_words(path=Dir.res + "/extradata/",
                   savepath=Dir.res +
                   "/parameter/summarization_parameter/clue_words",
                   word_index=3):
    _, res_sen = filter(path)
    words = {}
    for var in res_sen:
        for sen in var[1:]:
            ws = tools.seperate(sen)
            for w in ws[:word_index]:
                if w not in words.keys():
                    words[w] = 0
                words[w] += 1

    content = ""
    for w in words.keys():
        content += w + "," + str(words[w]) + "\n"
    ftools.write(savepath + str(word_index), content)
Example #27
    def read_file(self, dir):
        filelist = []
        tools.get_filelist(dir, filelist)
        data = {}
        reverse_data = {}
        filelist = sorted(filelist)
        for filename in filelist:
            with open(filename, mode="r", encoding="utf-8") as file:
                content = file.read()
                sentences = self.seperate_sentences(content)
                data[filename] = sentences
                for sen in sentences:
                    if sen not in reverse_data.keys():
                        reverse_data[sen] = [tools.get_name(filename)]
                    else:

                        reverse_data[sen].append(tools.get_name(filename))
                        # print(sen,reverse_data[sen])
        return data, reverse_data
Example #28
def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))
        abstract = ftools.read_lines(abs_root + fname)
        refence = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in refence]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"

    ftools.write(detail_path, content)
Example #29
def load_vectorize_files(vectorize_path):
    lines = ftools.read_lines(vectorize_path)
    res = {}
    for line in lines:
        seperate_point = line.rindex("\t")
        key = line[:seperate_point]
        # Slice the vector text out of the line (not out of the int index),
        # then strip the surrounding "['" and "']" before splitting.
        content = line[seperate_point + 1:][2:-2]
        vectors = [float(var) for var in content.split("','")]
        if key not in res.keys():
            res[key] = vectors
    return res
Example #30
    def craw_urls(self):
        start = 372
        for i in range(start, self.page_nums):
            request = Request.Request(self.url + str(i))
            for key in self.params.keys():
                request.add_header(key, self.params[key])
            response = Request.urlopen(request)

            html = response.read()
            html = html.decode('utf-8')
            infos = re.findall(self.url_regex, html)
            save_content = ""
            for info in infos:
                new_url = self.url_unqoate(info[-1])
                new_infor = [info[0], info[1], info[-1], new_url]
                save_content += self.seperator.join(new_infor) + "\n"

            tools.check_build_file(self.url_file)
            tools.write(self.url_file, content=save_content, mode="a")
            print(i, len(infos))
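
Example #30 omits its imports; the calls to Request.Request and Request.urlopen suggest urllib's request module imported under the alias Request. A plausible header for the crawler class, stated as an assumption since the original imports are not part of the snippet:

import re
from urllib import request as Request   # assumed alias behind Request.Request / Request.urlopen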