Example 1
def filter(path=Dir.res + "/extradata/"):
    """Align each abstract sentence with the news sentence sharing the longest
    common subsequence with it; return the aligned indices and sentences."""
    news_path = path + "news/"
    abstract_path = path + "abstract/"

    news_file_list = os.listdir(news_path)
    abst_file_list = os.listdir(abstract_path)

    bad_sample = []
    news = [ftools.read_lines(news_path + name) for name in news_file_list]
    abstracts = [ftools.read_lines(abstract_path + name)
                 for name in abst_file_list]

    res = []
    res_sen = []
    for i in range(len(news)):
        # matrix[k][j] = LCS length between news sentence j and abstract sentence k
        matrix = [[0 for _ in range(len(news[i]))]
                  for _ in range(len(abstracts[i]))]
        tmp = []
        tmp_sen = []
        try:
            for k in range(len(abstracts[i])):
                for j in range(len(news[i])):
                    matrix[k][j] = len(
                        crpss.longest_common_subsequence(
                            news[i][j], abstracts[i][k]))
                max_index = matrix[k].index(max(matrix[k]))
                tmp.append(max_index)
                tmp_sen.append(news[i][max_index])
        except Exception:
            bad_sample.append(news_file_list[i])

        res.append([news_file_list[i]] + tmp)
        res_sen.append([news_file_list[i]] + tmp_sen)
    return res, res_sen
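Note: crpss.longest_common_subsequence is a project-local helper. A minimal self-contained sketch of the same alignment idea, with a character-level lcs_len standing in for it (names and sentences here are illustrative):

def lcs_len(a, b):
    # classic O(len(a)*len(b)) dynamic-programming LCS length
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, ca in enumerate(a, 1):
        for j, cb in enumerate(b, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if ca == cb else max(dp[i - 1][j], dp[i][j - 1])
    return dp[-1][-1]

news = ["the cat sat on the mat", "dogs bark loudly", "rain fell all day"]
abstract = ["the cat sat", "it rained all day"]
# for each abstract sentence, pick the index of the news sentence with the longest LCS
alignment = [max(range(len(news)), key=lambda j: lcs_len(news[j], s)) for s in abstract]
print(alignment)  # [0, 2]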
Example 2
def rouge_detail(self, abstract_processed, save_dir):
    flist = tools.get_files(abstract_processed)
    save_content = []
    for fname in flist:
        content = tools.read_lines(abstract_processed + fname)
        reference = tools.read_lines(self.ref_processed + fname)
        lines = [line.split(" ") for line in content]
        refen = [line.split(" ") for line in reference]
        rouge1 = self.rouge_1_simple(refen, lines)
        rouge2 = self.rouge_2_simple(refen, lines)
        save_content.append(fname + "," + str(rouge1) + "," + str(rouge2))
    # "detials.txt" (sic) is the file name used throughout the project
    tools.write_list(save_dir + "/detials.txt", save_content)
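rouge_1_simple and rouge_2_simple are project-local scorers. As a sketch, a common ROUGE-1 recall formulation over tokenized sentences (an assumed definition, not necessarily the project's exact one):

def rouge_1_recall(reference_sents, candidate_sents):
    # fraction of reference unigrams that also occur in the candidate
    ref = [w for sent in reference_sents for w in sent]
    cand = set(w for sent in candidate_sents for w in sent)
    return sum(1 for w in ref if w in cand) / len(ref) if ref else 0.0

print(rouge_1_recall([["the", "cat", "sat"]], [["the", "cat", "ran"]]))  # ~0.667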
Example 3
def rouge_detail():
    fname = "trainning_3570.txt"
    content = ftools.read_lines(
        Dir.res +
        "/result/cleandata_small/Second Version/abstract_processed/" + fname)
    reference = ftools.read_lines(Dir.res + "/cleandata_small/ref_processed/" +
                                  fname)
    lines = [line.split(" ") for line in content]
    refen = [line.split(" ") for line in reference]
    rouge1 = rouge_1_simple(refen, lines)
    rouge2 = rouge_2_simple(refen, lines)
    print(rouge1, rouge2)
Example 4
    def train(self,
              dimension=200,
              iter=10,
              trainfile=Dir.res + "/WikiCorpus/wiki.jian.seperate.txt",
              load_model_if_exists=True):
        model_path = Dir.res + "/W2V/w2v_" + str(dimension) + ".model"
        if os.path.exists(model_path) and load_model_if_exists:
            self.model = Word2Vec.load(model_path)
            return self.model
        for string in tools.read_lines(trainfile):
            self.corpus.append(string.split(" "))
        self.model = Word2Vec(self.corpus,
                              size=dimension,
                              iter=iter,
                              min_count=5)
        if not os.path.lexists(Dir.res + "/W2V/"):
            os.makedirs(Dir.res + "/W2V/")
        # save to the same path the loader checks, so reloading works
        self.model.save(model_path)
        return self.model
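A hedged sketch of the same train-or-load flow with current gensim (>= 4), where the size/iter arguments used above were renamed vector_size/epochs; the path and tiny corpus are illustrative only:

import os
from gensim.models import Word2Vec

corpus = [["我", "喜欢", "自然", "语言"], ["语言", "模型", "很", "有用"]]
model_path = "w2v_200.model"  # hypothetical location
if os.path.exists(model_path):
    model = Word2Vec.load(model_path)
else:
    model = Word2Vec(corpus, vector_size=200, epochs=10, min_count=1)
    model.save(model_path)
print(model.wv["语言"][:5])  # first few components of a word vector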
Example 5
def craw_result_process(root=Dir.res + "/data/"):
    files = ftools.get_files(root)
    data = []
    for i in range(len(files)):
        filename = files[i]
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            judge = data_filter(news, abstract)
            if judge > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data
Example 6
def load_data(path):
    # each line is expected to look like "name,score1,score2"
    lines = ftools.read_lines(path)
    data = {}
    for line in lines:
        tmp = line.split(",")
        data[tmp[0]] = [float(tmp[1]), float(tmp[2])]
    return data
Example 7
def load_word_index(self, path):
    lines = tools.read_lines(path)
    for line in lines:
        # split on the last ":" in case the word itself contains a colon
        index = line.rindex(":")
        self.word_index[line[:index]] = int(line[index + 1:])
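The index file (written by build_word_index in Example 22 below) stores one "word:index" pair per line; rindex(":") ensures a word containing a colon still parses. A self-contained round trip:

word_index = {}
for line in ["apple:0", "http://x:1", "banana:2"]:
    i = line.rindex(":")
    word_index[line[:i]] = int(line[i + 1:])
print(word_index)  # {'apple': 0, 'http://x': 1, 'banana': 2}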
Example 8
def replace_words_by_num(whole_words, file_dir, save_dir):
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filename = []

    def name_filter(s):
        # only files whose name contains "all" are processed
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    content = {}
    for file in filename:
        lines = tools.read_lines(file)
        string = ""
        for line in lines:
            words = line.split(" ")
            for word in words:
                # words missing from the index are silently dropped
                if len(word) > 0 and word in whole_words:
                    string += str(whole_words[word]) + " "
            string = string.strip()
            string += "\n"
        content[tools.get_name(file)] = string
    for name in content:
        savepath = save_dir + name + ".txt"
        tools.write(savepath, content[name])
Example 9
def load(self):
    path = Dir.res + "/cleandata_highquality_1640/abstract/"
    for name in ftools.get_files(path):
        tmp = ftools.read_lines(path + name)
        self.answer[name] = []
        for var in tmp:
            if len(var.strip()) <= 5:
                continue
            self.answer[name].append(var)
Example 10
def update_rouge_details(dataname="cleandata_small", modelname="EntryBigraph"):
    ref_root = Dir.res + "/" + dataname + "/ref_processed/"
    abs_root = Dir.res + "/result/" + dataname + "/" + modelname + "/abstract_processed/"
    detail_path = Dir.res + "/result/" + dataname + "/" + modelname + "/detials.txt"
    filelist = ftools.get_files(ref_root)
    content = ""
    for i in range(len(filelist)):
        fname = filelist[i]
        print(i, len(filelist))  # progress
        abstract = ftools.read_lines(abs_root + fname)
        reference = ftools.read_lines(ref_root + fname)
        lines = [line.split(" ") for line in abstract]
        refen = [line.split(" ") for line in reference]
        rouge1 = rouge_1_simple(refen, lines)
        rouge2 = rouge_2_simple(refen, lines)
        print(fname, rouge1, rouge2)
        content += fname + "," + str(rouge1) + "," + str(rouge2) + "\n"

    ftools.write(detail_path, content)
Example 11
def build_w2v_train_data():
    file_dir = Dir.res + "data/news.sentences/"
    save_path = Dir.res + "data/all.txt"
    filelist = []
    content = []
    tools.get_filelist(file_dir, filelist)
    for file in filelist:
        sentences = tools.read_lines(file)
        content.extend(sentences)
    tools.write_list(save_path, content)
Example 12
def load_vectorize_files(vectorize_path):
    lines = ftools.read_lines(vectorize_path)
    res = {}
    for line in lines:
        seperate_point = line.rindex("\t")
        key = line[:seperate_point]
        # take everything after the tab, then strip the "['" ... "']" wrapper
        content = line[seperate_point + 1:][2:-2]
        vectors = [float(var) for var in content.split("','")]
        if key not in res:
            res[key] = vectors
    return res
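The slicing implies each line looks like the key, a tab, then a stringified list of quoted floats (an assumption inferred from the [2:-2] strip and the "','" separator):

line = "sentence one\t['0.1','0.2','0.3']"
sep = line.rindex("\t")
key = line[:sep]
body = line[sep + 1:][2:-2]  # strip the leading "['" and trailing "']"
vector = [float(v) for v in body.split("','")]
print(key, vector)  # sentence one [0.1, 0.2, 0.3]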
Example 13
def get_result(dataname="cleandata_highquality_3500"):
    root = Dir.res + "/result/" + dataname + "/"
    flist = ftools.get_files(root)
    content = ""
    for name in flist:
        if ".txt" in name:
            continue
        lines = ftools.read_lines(root + name + "/eval_res.txt")
        # extract the bracketed score list from the second line
        scores = lines[1][lines[1].index("[") + 1:lines[1].index("]")]
        content += name + ", " + scores + "\n"
    print(content)
    ftools.write(Dir.res + "/result/" + dataname + "/result.txt", content)
Example 14
def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        for line in lines:
            words = tools.seperate(line)
            data.append(TaggedDocument(words, ["sen_" + str(count)]))
            self.sen_dict[''.join(words)] = "sen_" + str(count)
            count += 1
    return data
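A hedged sketch of consuming such TaggedDocument data with gensim's Doc2Vec (gensim >= 4 argument names; tiny corpus for illustration only):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

data = [TaggedDocument(["我", "喜欢", "苹果"], ["sen_0"]),
        TaggedDocument(["我", "喜欢", "香蕉"], ["sen_1"])]
model = Doc2Vec(data, vector_size=50, min_count=1, epochs=20)
print(model.dv["sen_0"][:5])  # learned vector for the first tagged sentence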
Example 15
def transfer(cleandata_root=Dir.res + "/cleandata_1189/news/",
             save_path=Dir.res + "/sen_data/1189_corpus.txt"):
    filelist = os.listdir(cleandata_root)
    lines = []
    for name in filelist:
        filepath = cleandata_root + name
        for line in ftools.read_lines(filepath):
            words = tools.seperate(line)
            # map every digit token to a shared "num" token
            for i in range(len(words)):
                if words[i].isdigit():
                    words[i] = "num"
            lines.append(' '.join(words))

    ftools.write_list(save_path, lines)
Example 16
def get_small_data():
    root = Dir.res + "/cleandata_8700/"
    saveroot = Dir.res + "/cleandata_small/"

    flist = ftools.get_files(root + "news/")
    count = 0
    for i in range(len(flist)):
        name = flist[i]
        content = ftools.read_lines(root + "news/" + name)
        if len(content) < 80:
            print(count, i, len(flist))
            ftools.copy(root + "news/" + name, saveroot + "news/" + name)
            ftools.copy(root + "abstract/" + name,
                        saveroot + "abstract/" + name)
            count += 1
Example 17
def generate_new_data():
    npath = Dir.res + "/cleandata_highquality_3500/news/"
    # apath = Dir.res+"/cleandata_highquality_3500/abstract/"

    new_npath = Dir.res + "/cleandata_highquality_3500_new/news/"
    new_apath = Dir.res + "/cleandata_highquality_3500_new/abstract/"

    uper = Uper()

    for name in ftools.get_files(npath):
        path = npath + name
        content = ftools.read_lines(path)
        new_abstract = uper.summarize(content, num=3, fname=name[:-4])
        ftools.copy(path, new_npath + name)
        ftools.write_list(new_apath + name, new_abstract)
Example 18
def loaddata(path):
    trainformat_sentences = []
    content = ftools.read_lines(path)
    for line in content:
        # the article text is everything after the last comma on the line
        article = line[line.rindex(",") + 1:]
        sentences = tools.seperate_sentences(article)
        for sen in sentences:
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences
Example 19
def load_data(self, path=Dir.res + "/cleandata_604/news/"):
    flist = ftools.get_files(path)
    data = []
    count = 0
    for name in flist:
        filepath = path + name
        lines = ftools.read_lines(filepath)
        essay = ""
        tmp = []
        for line in lines:
            words = tools.seperate(line)
            tmp.extend(words)
            essay += ''.join(words)
        data.append(TaggedDocument(tmp, ["text_" + str(count)]))
        self.sen_dict[essay] = "text_" + str(count)
        count += 1
    return data
Example 20
def generate_data(file=Dir.res +
                  "/extract_data_process/data_processed_9.9.txt",
                  savePath=Dir.res + "/extract_data_process/data"):
    content = tools.read_lines(file)[1:-1]
    data = {}
    for row in content:  # renamed from "file", which shadowed the parameter
        row = row.replace("&nbsp;", "")
        tmp = str(row[1:-1]).split("', '")
        if tmp[1] not in data:
            data[tmp[1]] = tmp[2]
    index = 0
    for key in sorted(data.keys()):
        save_content = savePath + "/news/training_" + str(index) + ".txt"
        save_abstract = savePath + "/abstract/training_" + str(index) + ".txt"
        tools.write_list(save_content, seperate_sentences(data[key]))
        tools.write_list(save_abstract, seperate_sentences(key))
        index += 1
Example 21
def result_process(file_dir, save_dir):
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filenames = []
    tools.get_filelist(file_dir, filenames)
    for file in filenames:
        content = tools.read_lines(file)
        name = tools.get_name(file)
        result = []
        for line in content:
            # re-segment the line with jieba and join tokens with single spaces
            result.append(" ".join(jieba.cut(line)))
        # write once per file instead of rewriting the file for every line
        save_path = save_dir + "/" + name + ".txt"
        tools.write_list(save_path, result)
Example 22
def build_word_index(file_dir, words_path):
    filename = []

    def name_filter(s):
        # only files whose name contains "all" are indexed
        return "all" in s

    tools.get_filelist(file_dir, filename, name_filter)
    whole_words = {}
    for file in filename:
        lines = tools.read_lines(file)
        for line in lines:
            for word in jieba.cut(line):
                if len(word) > 0 and word not in whole_words:
                    whole_words[word] = len(whole_words)
    word_index = ""
    for word in whole_words.keys():
        word_index += word + ":" + str(whole_words[word]) + "\n"
    tools.write(words_path, word_index)
    return whole_words
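A self-contained sketch of the two-step pipeline (build_word_index, then replace_words_by_num from Example 8) on pre-segmented lines, without jieba or the file I/O:

lines = ["我 喜欢 苹果", "我 喜欢 香蕉"]
whole_words = {}
for line in lines:
    for word in line.split(" "):
        if word and word not in whole_words:
            whole_words[word] = len(whole_words)
encoded = [" ".join(str(whole_words[w]) for w in line.split(" ") if w in whole_words)
           for line in lines]
print(whole_words)  # {'我': 0, '喜欢': 1, '苹果': 2, '香蕉': 3}
print(encoded)      # ['0 1 2', '0 1 3']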
Example 23
def fill_all(path=Dir.res + "/craw_data/original/",
             save_path=Dir.res + "/craw_data/data/",
             fail_save_path=Dir.res + "/craw_data/fail/"):
    crawer = Crawer()
    files = tools.get_files(path)
    for name in files:
        content = tools.read_lines(path + name)
        fail_content = ""
        save_content = ""
        # truncate both output files before collecting results
        crawer.writeIntofile(save_path + name, "")
        crawer.writeIntofile(fail_save_path + name, "")
        succ_count, fail_count = 0, 0
        for line in content:
            tmp = line.split(",")
            article = crawer.get_article(tmp[-1]).strip().replace("\n", "")
            if len(article) > 0:
                save_content += tmp[0] + "," + tmp[1] + "," + article + '\n'
                succ_count += 1
            else:
                fail_content += tmp[0] + "," + tmp[1] + "," + tmp[2] + '\n'
                fail_count += 1
        crawer.writeIntofile(save_path + name, save_content)
        crawer.writeIntofile(fail_save_path + name, fail_content)
        print(name, succ_count, fail_count)
Example 24
def load_clue_words(self,
                    path=Dir.res +
                    "/parameter/summarization_parameter/clue_words"):
    list1 = ftools.read_lines(path)
    for var in list1:
        self.cluewords.add(var.strip())
Example 25
                    feed_dict={
                        self.xl:
                        [self.words2worvect(sens_words[i], words_bag)]
                    })
                sens_vec.append(list(sens_i_vec)[0])
            essay_vec = list(
                sess.run(self.encoder_op,
                         feed_dict={self.xl: [[1] * len(words_bag)]})[0])
        return sens_vec, essay_vec


if __name__ == "__main__":
    path = Dir.res + "/cleandata_small/news/trainning_2788.txt"
    text = ftools.read_lines(path)
    text = '。'.join(text)
    asv = Auto_Simple_Vec()
    sens, sens_words, sens_tags = asv.preprocess(text)

    print("sens_words length", len(sens_words))
    sen_vec, essay_vec = asv.vectorize(sens_words, sens_tags)
    print(sens[0], sens[1])
    print(asv.dist.sim(sen_vec[0], sen_vec[1]))
    print(asv.dist.sim(sen_vec[0], sen_vec[-1]))
    print(asv.dist.sim(sen_vec[2], sen_vec[3]))
Example 26
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/",
                     save_dir=Dir.res + "/cleandata_none"):
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))  # progress
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()
            try:
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])

                tmp = get_abstract_index(news, abstracts)

                count += 1
                # keep only samples where every abstract sentence was aligned
                if len(tmp) != len(abstracts):
                    continue
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:  # skip short articles
                    continue
                if sum(tmp[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(
                    save_dir + "/abstract/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(abstracts))
                tools.write(
                    save_dir + "/news/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(news))
            except Exception:
                # collect malformed lines instead of aborting the run
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))
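The first-comma/last-comma split above assigns every interior comma to the middle (abstract) field, and the article is only what follows the last comma. A minimal demonstration of the slicing, with an illustrative line:

line = "id42,short abstract, with a comma,article text"
first_, last_ = line.index(","), line.rindex(",")
parts = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
print(parts)  # ['id42', 'short abstract, with a comma', 'article text']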
Example 27
def load_file(filepath):
    # join all stripped lines into one string, separated by the Chinese full stop
    tmp = ftools.read_lines(filepath)
    return "。".join(line.strip() for line in tmp)
Example 28
        sens_vect = []
        essay_key = []
        for sen in sens:
            essay_key.extend(sen)
            vec = self.sen2v.get_sen_vec(sen)
            sens_vect.append(vec)
        essay_vector = self.doc2v.get_sen_vec(essay_key)
        return sens_vect, essay_vector


if __name__ == "__main__":

    # sen2v = Sen2Vec()
    # sen2v.train()
    # doc2v= Doc2Vec()
    # doc2v.train()

    sens = ftools.read_lines(Dir.res + "/cleandata_604/news/training_4.txt")
    pvdm_v = pvdm_vectorize()
    text = []
    for line in sens:
        text.append(tools.seperate(line))
    sens, essay = pvdm_v.vectorize(text)
    print(sens[0])
    for ss in sens:
        print(ss)

Example 29
        for var in num:
            em_vec[num[var]] = mau[var]
        for var in other:
            eo_vec[other[var]] = oau[var]
        essay_vector = en_vec + ev_vec + em_vec + eo_vec

        return sens_vecs, essay_vector


import Dir
if __name__ == "__main__":
    name = "training_4.txt"
    text_path = Dir.res + "/cleandata_604/news/" + name
    abstract_path = Dir.res + "/cleandata_604/abstract/" + name

    lines = ftools.read_lines(text_path)

    absts = ftools.read_lines(abstract_path)
    res = []
    for i in range(len(absts)):
        max_v, max_index = 0, 0
        for j in range(len(lines)):
            v = tools.sim(absts[i], lines[j])
            if v > max_v:
                max_v = v
                max_index = j
        res.append(max_index)
    print(res)

    sens, tags = [], []
    for line in lines:
Example 30
                option_score, tmp = self.summ.score_option(
                    option, coverage_list, relative_matrix, clues_list,
                    entities_list)
                if option_score > max_value:
                    best_option = option
                    max_value = option_score
            abstract = [sens[var] for var in best_option]
            return abstract
        else:
            print("using original summarizer")
            return self.summ.summarize(text, num)


if __name__ == "__main__":
    from src.tools import FileTools as ftools
    test_file = Dir.res + "/cleandata_highquality_100/news/trainning_31.txt"
    text = ftools.read_lines(test_file)
    summ = FastSummarize(ASVec)
    print(summ.info)
    res = summ.summarize(text)
    for line in res:
        print(line)

    test_file = Dir.res + "/cleandata_highquality_100/news/trainning_32.txt"
    text = ftools.read_lines(test_file)
    summ = FastSummarize(ASVec)
    print(summ.info)
    res = summ.summarize(text)
    for line in res:
        print(line)