def demo():
    """Summarize one training essay with ``Summarizor_luhn`` and print it.

    Reads ``extradata/luhn/training20.txt`` below ``Dir.resource``, passes
    the text to ``Summarizor_luhn.summarize`` and prints a separator line
    followed by each item of the returned iterable on its own line.
    """
    summarizor = Summarizor_luhn()
    # Forward slashes keep the path usable on every platform (Windows
    # accepts them as well) and match how load_commonwords builds its path.
    essay = tools.read(Dir.resource + "/extradata/luhn/training20.txt")
    result = summarizor.summarize(essay=essay)
    print("========================")
    for line in result:
        print(line)
def check_extract(file_dir, save_path):
    """Scan files under ``file_dir`` and save the lines flagged as extracts.

    Each file collected by ``tools.get_filelist`` is read, square brackets
    are removed from its content, and every line is split on ``"', '"``.
    Lines that do not yield exactly three fields are ignored. For the
    others, the second and third fields are handed to ``check_if_extract``,
    whose result is indexed as ``(flag, values)``:

    * when ``flag`` is truthy the line is added to the extract set;
    * when additionally ``sum(values[:-2])`` exceeds
      ``get_sum(values[-2]) + 2`` the line is also added to a second set,
      and the current sizes of both sets are printed.

    Args:
        file_dir: Directory passed to ``tools.get_filelist``.
        save_path: Destination for the extract set; the second set is
            written to ``save_path + ".txt"``.
    """
    files = []
    # NOTE(review): `filter` is forwarded as-is; unless this module rebinds
    # the name it is the builtin — confirm what get_filelist expects here.
    tools.get_filelist(file_dir, files, filter)

    extract_result = set()
    un_first_result = set()
    # Raw string: the previous non-raw "\[|\]|" relied on invalid escape
    # sequences, and its trailing empty alternative matched nothing useful.
    bracket_pattern = re.compile(r"\[|\]")

    for path in files:
        content = bracket_pattern.sub("", tools.read(path))
        for line in content.split("\n"):
            fields = line.split("', '")
            if len(fields) != 3:
                # Malformed line: skipped silently.
                continue

            extract = check_if_extract(fields[1], fields[2])
            if not extract[0]:
                continue

            extract_result.add(line)
            values = extract[1]
            all_value = sum(values[:-2])
            low_ = get_sum(values[-2])
            # The sum of the last entry does not take part in the decision
            # below; the call is kept because get_sum is defined elsewhere
            # and may validate its argument — TODO confirm and drop.
            get_sum(values[-1])

            if all_value > low_ + 2:
                un_first_result.add(line)
                print(len(extract_result), len(un_first_result))

    tools.write_list(save_path, extract_result)
    tools.write_list(save_path + ".txt", un_first_result)
def clean(data_dir=Dir.res + "/cleandata_8700/news/"):
    """Remove known boilerplate from every file in ``data_dir``, in place.

    Two fragments are handled: the "browser does not support the video
    tag" notice line is deleted wherever it occurs, and everything from the
    first occurrence of the site-navigation footer marker onwards is cut
    off. A file is rewritten (and its name printed) only when at least one
    of the two fragments was found.

    Args:
        data_dir: Directory prefix; file names returned by
            ``tools.get_files`` are appended to it directly.
    """
    video_notice = "您的浏览器不支持video标签\n"
    footer_marker = "新闻 专题 微博"

    for fname in tools.get_files(data_dir):
        path = data_dir + fname
        content = tools.read(path)

        # Interactive inspection hook: dumps the file and waits for Enter
        # whenever the file name contains "3805".
        if "3805" in fname:
            print(content)
            input()

        changed = False
        if video_notice in content:
            content = content.replace(video_notice, "")
            changed = True

        # partition() yields a non-empty separator only when the marker is
        # present; `head` is then the text before its first occurrence.
        head, marker, _ = content.partition(footer_marker)
        if marker:
            content = head
            changed = True

        if not changed:
            continue
        print(fname)
        tools.write(path, content)
def preprocess_file(file, savepath):
    """Preprocess the content of ``file`` and write the result to ``savepath``.

    The file is read with ``tools.read``, passed through ``preprocess``, and
    the returned value is written with ``tools.write_list``.
    """
    tools.write_list(savepath, preprocess(tools.read(file)))
def load_files(self, filedir):
    """Read every file found under ``filedir`` and return the contents.

    Args:
        filedir: Directory handed to ``tools.get_filelist``.

    Returns:
        A list with one ``tools.read`` result per collected file name, in
        the order the names were collected.
    """
    # The previous `filenames, data = []` raised ValueError on every call:
    # an empty list cannot be unpacked into two names.
    filenames = []
    # get_filelist is expected to append the discovered paths to the list
    # it is given (the same calling convention the original code relied on).
    tools.get_filelist(filedir, filenames)
    return [tools.read(file) for file in filenames]
def load_commonwords(self):
    """Load the Luhn common-words resource into ``self.commonwords``.

    The attribute receives whatever ``tools.read`` returns for
    ``extradata/luhn/commonwords`` below ``Dir.resource``.
    """
    self.commonwords = tools.read(Dir.resource + "/extradata/luhn/commonwords")