Exemplo n.º 1
0
def vectorize_files(fileroot,savepath):
    """Vectorize every document found under ``fileroot`` and save the result.

    The lines of each entry returned by ``ftools.read_dir_lines_dict`` are
    joined with '。', preprocessed and vectorized by a fresh ``AutoCoder``,
    and the vectors are written with ``tools.save_object`` to
    ``Dir.res + "/encoder/cleandata_8700/" + <md5 of the concatenated words>``.
    Progress (index, total, seconds spent in ``vectorize``) is printed per
    document.

    Args:
        fileroot: Directory passed to ``ftools.read_dir_lines_dict``.
        savepath: Not used by this function; the output directory is
            hard-coded. NOTE(review): confirm whether callers expect the
            results to land in ``savepath``.
    """
    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    total = len(data)
    print(total)
    for count, key in enumerate(data, start=1):
        text = '。'.join(data[key])

        # The first element returned by preprocess is not needed here.
        _, sens_words, sens_tags = auto.preprocess(text)

        # Only the vectorize call itself is timed.
        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()

        # The file name is the MD5 of all words concatenated.
        key_text = ''.join([''.join(var) for var in sens_words])
        save_key = tools.md5(key_text)

        # Convert to plain lists before saving.
        save_object = [[list(var) for var in sens_vector], list(essay_vector)]
        tools.save_object(save_object, Dir.res + "/encoder/cleandata_8700/" + save_key)

        print(count, total, end - start)
Exemplo n.º 2
0
 def save_value(self, path, key, coverage_list, relative_matrix, clues_list,
                entities_list):
     """Save one record as a single-entry dict at ``path``.

     ``ftools.check_filename(path)`` is called first, then a dict mapping
     ``key`` to ``[coverage_list, relative_matrix, clues_list,
     entities_list]`` is written with ``tools.save_object``.
     """
     ftools.check_filename(path)
     record = [coverage_list, relative_matrix, clues_list, entities_list]
     tools.save_object({key: record}, path)
Exemplo n.º 3
0
 def save_value(self, path, text, coverage_list, relative_matrix,
                clues_list, entities_list):
     """Save one record at ``path``, keyed by ``text`` joined with '#$#'.

     ``ftools.check_filename(path)`` is called first, then a single-entry
     dict mapping ``'#$#'.join(text)`` to ``[coverage_list, relative_matrix,
     clues_list, entities_list]`` is written with ``tools.save_object``.
     """
     ftools.check_filename(path)
     record = [coverage_list, relative_matrix, clues_list, entities_list]
     record_key = '#$#'.join(text)
     tools.save_object({record_key: record}, path)
Exemplo n.º 4
0
def analyze(main_name, compare_index, name="cleandata_small"):
    """Rank files by how ``main_name`` scores against two baseline results.

    Loads the per-file details of ``main_name``, ``EntryBigraph`` and
    ``TextRank`` from ``Dir.res + "/result/" + name``. For every key in the
    ``main_name`` details it builds a six-element list:

        [0] main[0] - entry[0]
        [1] main[1] - entry[1]
        [2] main[0] - textrank[0]
        [3] main[1] - textrank[1]
        [4] (main[0] - textrank[0]) + (entry[0] - textrank[0])
        [5] (main[1] - textrank[1]) + (entry[1] - textrank[1])

    Files listed under ``Dir.res/<name>/news/`` whose stored judge value is
    above 0.5 are kept; the judge value is appended as element [6]. The kept
    entries are sorted in descending order of element ``compare_index`` and
    written to ``Dir.res/result/<name>/<main_name>.txt``.

    Args:
        main_name: Result sub-directory to compare against the baselines;
            also used as the output file stem.
        compare_index: Index into the per-file list described above
            (0-6) used as the sort key.
        name: Dataset directory name.

    Returns:
        The kept file names, ordered from highest to lowest sort key.

    Raises:
        KeyError: If a key is missing from one of the loaded dicts.
    """
    judge_path = Dir.res + "/result/judge.txt"
    jude_dict = tools.load_object(judge_path)

    print(len(jude_dict))

    result_root = Dir.res + "/result/" + name + "/"
    entry_data = load_data(result_root + "EntryBigraph/detials.txt")
    first_data = load_data(result_root + main_name + "/detials.txt")
    tr_data = load_data(result_root + "TextRank/detials.txt")

    result = {}
    for key, first in first_data.items():
        entry = entry_data[key]
        tr = tr_data[key]
        result[key] = [
            first[0] - entry[0],
            first[1] - entry[1],
            first[0] - tr[0],
            first[1] - tr[1],
            first[0] - tr[0] + entry[0] - tr[0],
            first[1] - tr[1] + entry[1] - tr[1],
        ]

    news_root = Dir.res + "/" + name + "/news/"
    new_result = {}
    for filename in ftools.get_files(news_root):
        jude = jude_dict[filename]
        if jude > 0.5:
            # The list is shared with ``result`` on purpose: the judge value
            # is appended in place, exactly as before.
            scores = result[filename]
            scores.append(jude)
            new_result[filename] = scores

    # The judge dict is written back unchanged, as in the original code.
    tools.save_object(jude_dict, judge_path)

    ranked = sorted(new_result.items(),
                    key=lambda item: item[1][compare_index],
                    reverse=True)
    # dict preserves insertion order, so the saved dict keeps the ranking.
    save_dict = dict(ranked)
    names = list(save_dict)

    save_path = result_root + main_name + ".txt"
    ftools.write_com_dict(save_path, save_dict)
    return names
Exemplo n.º 5
0
    def vectorize(self,sens_words,sens_tags):
        """Return the sentence vectors and essay vector for the given words.

        The MD5 of all words concatenated is used as the lookup key. If the
        key is present in ``self.data`` the stored pair is returned.
        Otherwise ``self.auto.vectorize`` computes the pair, a plain-list copy
        is saved with ``tools.save_object`` under
        ``Dir.res + "/encoder/cleandata_8700/" + key``, and the freshly
        computed pair is returned.

        Args:
            sens_words: Iterable of word sequences, one per sentence.
            sens_tags: Passed through to ``self.auto.vectorize``.

        Returns:
            A ``(sentence_vectors, essay_vector)`` tuple.
        """
        key_text = ''.join([''.join(var) for var in sens_words])
        key = tools.md5(key_text)

        if key in self.data:
            cached = self.data[key]
            return cached[0], cached[1]

        print("trainning")
        sens_vector, essay_vector = self.auto.vectorize(sens_words, sens_tags)

        # Save a plain-list copy; the same hash serves as the file name.
        save_object = [[list(var) for var in sens_vector], list(essay_vector)]
        tools.save_object(save_object, Dir.res+"/encoder/cleandata_8700/" + key)
        # NOTE(review): the new result is saved to disk but not inserted into
        # self.data, so the same text is recomputed on a later call in this
        # process. Confirm whether self.data should be updated here.
        return sens_vector, essay_vector