Example No. 1
def craw_result_process(root=Dir.res + "/data/"):
    files = ftools.get_files(root)
    data = []
    for filename in files:
        if len(data) > 10:
            break
        lines = ftools.read_lines(root + filename)
        for line in lines:
            tmp = line.split(",")
            # print("news",len(tmp[2]))
            # print("news",tmp[2])
            #
            # print("abstract",len(tmp[1]))
            # print("abstract",tmp[1])

            abstract = tools.seperate_sentences(tmp[1])
            news = tools.seperate_sentences(tmp[2])
            print(abstract)
            print(news)
            # input()
            judge = data_filter(news, abstract)
            if judge > 0.5:
                data.append(['\n'.join(abstract), '\n'.join(news)])
    return data
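Every example on this page goes through `tools.seperate_sentences` (the project keeps this spelling), which is not shown here. As a rough guide, a minimal sentence splitter for Chinese news text, assuming it simply cuts on sentence-final punctuation, could look like the sketch below; the real helper may behave differently.

import re

def seperate_sentences(text):
    # Hypothetical stand-in for tools.seperate_sentences: split on Chinese
    # (and ASCII) end-of-sentence punctuation and drop empty pieces.
    parts = re.split(r"(?<=[。！？!?；;])", text)
    return [p.strip() for p in parts if p.strip()]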
Example No. 2
 def textrank(self, text):
     sentences = tools.seperate_sentences(text)
     words = {}
     words_list = []
     res = {}
     sen_words = []
     for sen in sentences:
         ws = tools.seperate(sen)
         sen_words.append(ws)
         for w in ws:
             if w not in words.keys():
                 words_list.append(w)
                 words[w] = len(words)
     matrix = np.zeros((len(words), len(words)))
     # matrix = [[0] * len(words) for var in range(len(words))]
     for sen_w in sen_words:
         for i in range(len(sen_w)):
             for j in range(i, len(sen_w)):
                 # print(words[sen_w[i]],words[sen_w[j]],len(words))
                 matrix[words[sen_w[i]], words[sen_w[j]]] += 1
                 matrix[words[sen_w[j]], words[sen_w[i]]] += 1
     nx_graph = nx.from_numpy_matrix(matrix)
     score = nx.pagerank(nx_graph, alpha=0.85)
     sorted_score = sorted(score.items(),
                           key=lambda item: item[1],
                           reverse=True)
     for index, value in sorted_score:
         if words_list[index] not in res.keys():
             res[words_list[index]] = value
     return res
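One caveat about Example No. 2: `nx.from_numpy_matrix` was removed in NetworkX 3.0. On current NetworkX the same graph is built with `from_numpy_array`; a small sketch of the updated call, using a placeholder matrix instead of the co-occurrence counts computed above:

import networkx as nx
import numpy as np

matrix = np.zeros((3, 3))                  # placeholder; use the matrix built in textrank()
nx_graph = nx.from_numpy_array(matrix)     # NetworkX >= 3.0 replacement for from_numpy_matrix
score = nx.pagerank(nx_graph, alpha=0.85)  # same damping factor as above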
Example No. 3
    def weighted_vectorize(self, text):
        res = []
        sentences = tools.seperate_sentences(text)
        tr_text = self.tr.textrank(text)
        for sen in sentences:
            tmp = []
            tmp_weight = []
            sen_words = tools.seperate(sen)
            for w in sen_words:
                if w in self.model.wv.vocab:
                    tmp.append(self.model[w])
                    if w in tr_text:
                        tmp_weight.append(tr_text[w])
                    else:
                        tmp_weight.append(1 / len(sen_words))
                else:
                    tmp.append([0] * self.vec_length)
                    tmp_weight.append(1 / len(sen_words))
            for i in range(len(tmp)):
                tmp[i] = tools.vector_multi(tmp[i],
                                            tmp_weight[i] / sum(tmp_weight))

            sen_vec = tools.vector_add_multi(tmp)
            if len(sen_vec) == 0:
                print(sen)
            res.append(sen_vec)
        return res
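Examples No. 3 and No. 13 check the vocabulary through `model.wv.vocab`, which is the Gensim 3.x API; in Gensim 4.x the mapping was renamed `key_to_index` and vectors are read through `model.wv`. A sketch of an equivalent lookup under Gensim 4.x (the function name and the zero-vector fallback mirror the example, but are otherwise assumptions):

from gensim.models import Word2Vec

def word_vector(model: Word2Vec, word: str, vec_length: int):
    # Gensim 4.x: wv.key_to_index replaces wv.vocab, and model.wv[word]
    # replaces model.__getitem__(word).
    if word in model.wv.key_to_index:
        return model.wv[word]
    return [0.0] * vec_length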
Example No. 4
 def generate_data(self, text):
     if isinstance(text, str):
         sens = tools.seperate_sentences(text)
     else:
         sens = text
     words = []
     sen_words = []
     for sen in sens:
         wp = tools.sen_pog(sen)
         tmp = []
         for w, p in wp:
             if "n" in p or "v" in p or "m" in p:
                 tmp.append(w)
                 if w not in words:
                     words.append(w)
         sen_words.append(tmp)
     vector = []
     for sen_w in sen_words:
         tmp = [0] * len(words)
         for i in range(len(words)):
             w = words[i]
             if w in sen_w:
                 tmp[i] = 1
         vector.append(tmp)
     return words, vector
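Example No. 4 keeps only words whose POS tag contains "n", "v" or "m" (nouns, verbs and, presumably, numerals) and returns the vocabulary together with a binary sentence-term incidence matrix. A tiny self-contained illustration of that return shape, with the POS filtering already applied to made-up sentences:

# Hypothetical, already-filtered sentence words, used only to show the output shape.
sen_words = [["北京", "举办", "会议"], ["会议", "结束"]]

words = []
for sen in sen_words:
    for w in sen:
        if w not in words:
            words.append(w)

# One binary row per sentence: 1 where the word occurs in that sentence.
vector = [[1 if w in sen else 0 for w in words] for sen in sen_words]
# words  -> ['北京', '举办', '会议', '结束']
# vector -> [[1, 1, 1, 0], [0, 0, 1, 1]]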
Example No. 5
 def summarize(self, essay, num=3):
     sentences = tools.seperate_sentences(essay)
     if len(sentences) <= num:
         return sentences
     # print(sentences.__len__())
     mid_graph = self.build_graph(sentences)
     graph = self.generate_normal_graph(mid_graph[1])
     # print_graph(graph)
     if len(graph) == 0:
         return sentences[:num]
     au, hub = HITS.HITS(graph)
     sorted_au = sorted(au.items(), key=lambda item: item[1], reverse=True)
     sorted_hub = sorted(hub.items(),
                         key=lambda item: item[1],
                         reverse=True)
     result = []
     for res in sorted_au[:num]:
         # print(res)
         result.append(int(res[0]))
     result.sort()
     abstract = []
     for res in result:
         abstract.append(sentences[res])
     # for sent in abstract:
     #     print(sent)
     return abstract
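Example No. 5 ranks sentences with the project-local `HITS.HITS(graph)`, which takes an adjacency dict and appears to return authority scores first. If that module is not at hand, NetworkX ships the same algorithm; note that `nx.hits` returns hubs first. A rough substitute on a placeholder adjacency dict:

import networkx as nx

graph = {0: [1, 2], 1: [0], 2: [0]}   # placeholder sentence graph
G = nx.DiGraph()
for u, nbrs in graph.items():
    G.add_edges_from((u, v) for v in nbrs)
hubs, authorities = nx.hits(G)        # hubs first in NetworkX
top = sorted(authorities.items(), key=lambda item: item[1], reverse=True)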
Example No. 6
 def analyze(self, text):
     sens_words, sens_tag = [], []
     sens = tools.seperate_sentences(text)
     for sen in sens:
         tmp_words, tmp_tag = tools.seperate_pog(sen)
         sens_words.append(tmp_words)
         sens_tag.append(tmp_tag)
     return sens, sens_words, sens_tag
Example No. 7
 def preprocess(self, text):
     sens_words, sens_tag = [], []
     sens = tools.seperate_sentences(text)
     for sen in sens:
         tmp_words, tmp_tag = [], []
         for w, t in tools.sen_pog(sen):
             tmp_words.append(w)
             tmp_tag.append(t)
         sens_words.append(tmp_words)
         sens_tag.append(tmp_tag)
     return sens, sens_words, sens_tag
Example No. 8
 def get_sens_words(self, text):
     sens = tools.seperate_sentences(text)
     sens_words = []
     for line in sens:
         words, tags = tools.seperate_pog(line)
         for i in range(len(words)):
             w = words[i]
             if w not in self.words_tags_dict.keys():
                 self.words_tags_dict[w] = tags[i]
         sens_words.append(words)
     return sens_words
Example No. 9
 def vectorize(self, text):
     sens = tools.seperate_sentences(text)
     matrix = []
     for sen in sens:
         tmp = tools.sen_pog(sen)
         pog_tmp = []
         for w, p in tmp:
             if p == "n" or "v" in p:
                 pog_tmp.append(w)
         matrix.append(pog_tmp)
     tr_res = self.tr.textrank_matrix(matrix)
     return tr_res
Example No. 10
 def analyze(self, text):
     sens_words, sens_tag = [], []
     sens = tools.seperate_sentences(text)
     tmp = []
     for sen in sens:
         if "原标题" in sen:
             continue
         tmp.append(sen)
         tmp_words, tmp_tag = tools.seperate_pog(sen)
         sens_words.append(tmp_words)
         sens_tag.append(tmp_tag)
     return tmp, sens_words, sens_tag
Example No. 11
def text2pic(text):
    sens = tools.seperate_sentences(text)
    nodes = []

    nodes_dict = {}
    sen_words = []
    sen_noun_words = []
    for sen in sens:
        wp = tools.sen_pog(sen)
        tmp_w = []
        tmp_p = []
        tmp_noun = []
        for w, p in wp:
            if "n" in p or "v" in p or "m" in p:
                if w not in nodes:
                    nodes.append(w)
                if w not in nodes_dict.keys():
                    nodes_dict[w] = 0
                nodes_dict[w] += 1
                tmp_noun.append(w)
            # tmp.append([w,p])
            tmp_w.append(w)
            tmp_p.append(p)
        sen_noun_words.append(tmp_noun)
        sen_words.append([tmp_w, tmp_p])
    # nodes = []
    # tmp = sorted(nodes_dict.items(), key= lambda d:d[1],reverse=True)
    # for var,count in tmp:
    #     nodes.append(var)
    #
    # print(tmp)

    matrix = [[0] * len(nodes) for _ in range(len(nodes))]
    for k in range(len(sen_noun_words)):
        var = sen_noun_words[k]
        for i in range(len(var) - 1):
            for j in range(i + 1, len(var)):
                #
                matrix[nodes.index(var[i])][nodes.index(var[j])] += 1
                matrix[nodes.index(var[j])][nodes.index(var[i])] += 1
                # nouni_index = sen_words[k][0].index(var[i])
                # nounj_index = sen_words[k][0].index(var[j])
                # if nouni_index == nounj_index-1 and True:
                #     matrix[nodes.index(var[i])][nodes.index(var[j])] +=1
                #     matrix[nodes.index(var[j])][nodes.index(var[i])] +=1
                # else:
                # for p in sen_words[k][1][nouni_index:nounj_index]:
                #     if "v" in p or "m" in p:
                #         matrix[nodes.index(var[i])][nodes.index(var[j])] += 1
                #         matrix[nodes.index(var[j])][nodes.index(var[i])] += 1
                #         break
    return matrix, nodes
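`text2pic` calls `nodes.index()` four times for every co-occurring pair, so each lookup scans the whole vocabulary while the matrix is filled. A behaviour-preserving tweak, sketched here on placeholder data rather than the original variables, is to build an index dictionary once:

# Placeholder inputs standing in for the nodes / sen_noun_words built above.
nodes = ["北京", "会议", "闭幕"]
sen_noun_words = [["北京", "会议"], ["会议", "闭幕"]]

node_index = {w: i for i, w in enumerate(nodes)}   # O(1) lookups per word
matrix = [[0] * len(nodes) for _ in range(len(nodes))]
for var in sen_noun_words:
    for i in range(len(var) - 1):
        for j in range(i + 1, len(var)):
            a, b = node_index[var[i]], node_index[var[j]]
            matrix[a][b] += 1
            matrix[b][a] += 1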
Example No. 12
    def preprocess(self, text):
        sens_words, sens_tag = [], []
        sens = tools.seperate_sentences(text)
        tmp = []
        for i in range(1, len(sens)):
            sen = sens[i]
            # for sen in sens:
            if "原标题" in sen:
                continue
            tmp.append(sen)
            tmp_words, tmp_tag = tools.seperate_pog(sen)
            sens_words.append(tmp_words)
            sens_tag.append(tmp_tag)

        return tmp, sens_words, sens_tag
Example No. 13
 def unweighted_vectorize(self, text):
     res = []
     sentences = tools.seperate_sentences(text)
     for line in sentences:
         tmp = []
         for word in tools.seperate(line):
             if word in self.model.wv.vocab:
                 wv = self.model[word]
                 tmp.append(wv)
             else:
                 tmp.append([0] * self.vec_length)
         tmp = tools.vector_add_multi(tmp)
         tmp = tools.vector_multi(tmp, 1 / (len(tmp)))
         res.append(tmp)
     return res
Example No. 14
def sum2pic(text, nodes):
    sens = tools.seperate_sentences(text)
    sen_n = []
    sen_w = []
    sen_p = []
    # nodes_dict = {}
    for sen in sens:
        wp = tools.sen_pog(sen)
        tmp_sen_n = []
        tmp_sen_w = []
        tmp_sen_p = []
        for w, p in wp:
            if ("n" in p or "v" in p or "m" in p) and w in nodes:
                tmp_sen_n.append(w)
            # if w not in nodes_dict.keys():
            #     nodes_dict[w] = 0
            # nodes_dict[w] += 1
            tmp_sen_w.append(w)
            tmp_sen_p.append(p)
        sen_n.append(tmp_sen_n)
        sen_w.append(tmp_sen_w)
        sen_p.append(tmp_sen_p)

    # nodes = []
    # tmp = sorted(nodes_dict.items(), key=lambda d: d[1], reverse=True)
    # for var, count in tmp:
    #     nodes.append(var)

    # print(tmp)

    matrix = [[0] * len(nodes) for _ in range(len(nodes))]
    for i in range(len(sen_n)):

        for j in range(len(sen_n[i])):
            for k in range(j + 1, len(sen_n[i])):
                # nouni_index = sen_w[i].index(sen_n[i][j])
                # nounj_index = sen_w[i].index(sen_n[i][k])
                matrix[nodes.index(sen_n[i][j])][nodes.index(sen_n[i][k])] += 1
                matrix[nodes.index(sen_n[i][k])][nodes.index(sen_n[i][j])] += 1
                # if nouni_index == nounj_index-1 and True :
                #     matrix[nodes.index(sen_n[i][j])][nodes.index(sen_n[i][k])] +=1
                #     matrix[nodes.index(sen_n[i][k])][nodes.index(sen_n[i][j])] +=1
                # for p in sen_p[i][nouni_index:nounj_index+1]:
                #     if "v" in p or "m" in p:
                #         matrix[nodes.index(sen_n[i][j])][nodes.index(sen_n[i][k])] += 1
                #         matrix[nodes.index(sen_n[i][k])][nodes.index(sen_n[i][j])] += 1
                #         break
    return matrix
Example No. 15
def loaddata(path):
    # flist = ftools.get_files(data_root)

    # count =1
    # for name in flist:
    #     print(count,len(flist))
    #     count+=1
    #     path = data_root+name
    trainformat_sentences = []
    content = ftools.read_lines(path)
    for line in content:
        article = line[line.rindex(",") + 1:]
        sentences = tools.seperate_sentences(article)
        for sen in sentences:
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences
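`loaddata` returns a list of token lists, which is exactly the `sentences` format Gensim's `Word2Vec` trainer expects, so the function presumably feeds a training run. A minimal sketch with Gensim 4.x parameter names; the path and the hyperparameters below are placeholders, not values from the original project:

from gensim.models import Word2Vec

sentences = loaddata("res/data/news.csv")   # hypothetical path
model = Word2Vec(sentences=sentences, vector_size=100, window=5,
                 min_count=2, workers=4)
model.save("word2vec.model")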
Example No. 16
 def vectorize(self, text):
     sens = tools.seperate_sentences(text)
     short_text = []
     for sen in sens:
         short_text.append(self.ltp.short_sentences(sen))
     s_w_tr = self.tr.textrank_matrix(short_text)
     sen_vs = []
     for sen in sens:
         tmp = []
         for w in s_w_tr.keys():
             if w in sen:
                 tmp.append(s_w_tr[w])
             else:
                 tmp.append(0.0)
         sen_vs.append(tmp)
     return sen_vs
Example No. 17
 def vectorize(self, text):
     sentences = tools.seperate_sentences(text)
     res = []
     words = {}
     sen_w = []
     for i in range(len(sentences)):
         sen_words = tools.seperate(sentences[i])
         sen_w.append(sen_words)
         for w in sen_words:
             if w not in words.keys():
                 words[w] = len(words)
     for i in range(len(sen_w)):
         tmp = [0] * len(words)
         for var in sen_w[i]:
             tmp[words[var]] += 1
         res.append(tmp)
     return res
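Example No. 17 builds per-sentence term-count vectors by hand. The same representation (up to column ordering) can be produced with scikit-learn's `CountVectorizer` when the project's segmenter is passed in as the tokenizer; a sketch, assuming `tools.seperate` and the `sentences` list from the example above:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=tools.seperate, token_pattern=None,
                             lowercase=False)
counts = vectorizer.fit_transform(sentences)   # sparse matrix, one row per sentence
res = counts.toarray().tolist()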
Example No. 18
    def summarize(self, essay, num=3, fname=None):
        sentences = tools.seperate_sentences(essay)
        if len(sentences) <= num:
            return sentences
        # print(sentences.__len__())
        mid_graph = self.build_graph(sentences)
        bigraph = self.generate_bigraph(mid_graph)
        graph = self.generate_normal_graph(mid_graph[1])
        # print_graph(graph)
        au, hub = HITS.HITS(bigraph)

        od = {}
        for node in graph.keys():
            od[node] = len(graph[node])/(node+1)
        e = {}
        for node in mid_graph[1].keys():
            e[node] = len(mid_graph[1][node])

        options = self.optimization(au,od,e)
        abstract = []
        for var in options:
            abstract.append(sentences[var])

        return abstract
Example No. 19
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/",
                     save_dir=Dir.res + "/cleandata_none"):
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()

            try:
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])

                tmp = get_abstract_index(news, abstracts)

                count += 1
                if len(tmp) != len(abstracts):
                    continue
                # print(tmp)
                # cmd = input()
                # if "1" in cmd:
                #     print('\n'.join(abstracts))
                #     print("--------------------")
                #     print('\n'.join(news))
                #
                #     print("--------------------")
                #     print("words:",w_count)
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:
                    continue

                if sum(tmp[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(
                    save_dir + "/abstract/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(abstracts))
                tools.write(
                    save_dir + "/news/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(news))
            except Exception as e:
                print(str(e), e.with_traceback(e.__traceback__))
                print("error", line)
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))
Example No. 20
 def ed_sentence(self, essay):
     self.sentence = tools.seperate_sentences(essay)
 def summarize(self, essay, num=3, fname=None):
     sentences = tools.seperate_sentences(essay)
     return sentences[:num]