Пример #1
0
def demo():
    """Summarize a bundled sample essay with ``Summarizor_luhn`` and print it.

    The essay is read from ``training20.txt`` under the Luhn extra-data
    resource directory. A separator line is printed first, then each item of
    the summarizer's result on its own line.
    """
    luhn = Summarizor_luhn()
    sample_path = Dir.resource + "\\extradata\\luhn\\training20.txt"
    text = tools.read(sample_path)
    summary = luhn.summarize(essay=text)
    print("========================")
    for sentence in summary:
        print(sentence)
Пример #2
0
def check_extract(file_dir, save_path):
    """Collect lines judged "extractive" from the files under ``file_dir``.

    Each file is read, square brackets are stripped, and the content is split
    into lines. A line is considered only when splitting it on ``', '`` yields
    exactly three fields; the second and third fields are then passed to
    ``check_if_extract``.

    Args:
        file_dir: Directory handed to ``tools.get_filelist`` to enumerate files.
        save_path: Destination for the set of accepted lines. The subset that
            also exceeds the score threshold below is written to
            ``save_path + ".txt"``.

    Side effects:
        Prints the running sizes of both result sets after every accepted
        line, and writes both sets via ``tools.write_list``.
    """
    files = []
    # presumably get_filelist fills `files` in place (its return value is
    # unused here) -- verify against tools.
    # NOTE(review): `filter` resolves to the builtin unless this module
    # defines its own `filter`; confirm what get_filelist expects.
    tools.get_filelist(file_dir, files, filter)

    extract_result = set()
    un_first_result = set()
    for file in files:
        # Raw-string character class: removes every "[" and "]".
        content = re.sub(r"[\[\]]", "", tools.read(file))
        for line in content.split("\n"):
            tmp = line.split("', '")
            if len(tmp) != 3:
                # Not a three-field record; skip silently.
                continue

            extract = check_if_extract(tmp[1], tmp[2])
            if not extract[0]:
                continue
            extract_result.add(line)

            # extract[1] is treated as a sequence whose leading entries are
            # summable and whose second-to-last entry is reduced by get_sum.
            all_value = sum(extract[1][:-2])
            low_ = get_sum(extract[1][-2])
            if all_value > low_ + 2:
                un_first_result.add(line)

            # Progress output: sizes of both result sets so far.
            print(len(extract_result), len(un_first_result))

    tools.write_list(save_path, extract_result)
    tools.write_list(save_path + ".txt", un_first_result)
Пример #3
0
def clean(data_dir=Dir.res + "/cleandata_8700/news/"):
    """Strip known boilerplate from every file in ``data_dir``, in place.

    Two clean-ups are applied to each file's content:
      * every occurrence of the "browser does not support the video tag"
        notice (including its trailing newline) is removed;
      * everything from the first "新闻 专题 微博" marker onwards is cut off.

    Files whose content changed are rewritten and their names printed.

    Args:
        data_dir: Directory prefix; file names returned by ``tools.get_files``
            are appended to it directly, so it must end with a separator.
    """
    video_notice = "您的浏览器不支持video标签\n"
    footer_marker = "新闻 专题 微博"

    for fname in tools.get_files(data_dir):
        path = data_dir + fname
        original = tools.read(path)

        # NOTE(review): looks like a leftover debugging pause -- it dumps the
        # file and blocks on stdin for any name containing "3805".
        if "3805" in fname:
            print(original)
            input()

        cleaned = original
        if video_notice in cleaned:
            cleaned = cleaned.replace(video_notice, "")
        cut_at = cleaned.find(footer_marker)
        if cut_at != -1:
            cleaned = cleaned[:cut_at]

        # Either clean-up strictly shortens the text, so a difference from
        # the original means at least one of them was applied.
        if cleaned != original:
            print(fname)
            tools.write(path, cleaned)
Пример #4
0
def preprocess_file(file, savepath):
    """Run ``preprocess`` on the content of ``file`` and save the result.

    Args:
        file: Path read with ``tools.read``.
        savepath: Path the preprocessed result is written to with
            ``tools.write_list``.
    """
    raw_text = tools.read(file)
    tools.write_list(savepath, preprocess(raw_text))
Пример #5
0
def load_files(self, filedir):
    """Read every file found under ``filedir`` and return their contents.

    Args:
        filedir: Directory handed to ``tools.get_filelist`` to enumerate files.

    Returns:
        A list with one ``tools.read`` result per enumerated file, in the
        order the file names were collected.
    """
    # Two separate lists: unpacking a single empty list into two names
    # raises ValueError.
    filenames, data = [], []
    # presumably get_filelist appends the discovered paths to `filenames`
    # in place (its return value is unused) -- verify against tools.
    tools.get_filelist(filedir, filenames)
    data.extend(tools.read(name) for name in filenames)
    return data
Пример #6
0
 def load_commonwords(self):
     """Load the Luhn common-words resource into ``self.commonwords``.

     The attribute receives whatever ``tools.read`` returns for the
     ``commonwords`` file under the Luhn extra-data resource directory.
     """
     self.commonwords = tools.read(Dir.resource + "/extradata/luhn/commonwords")