def get_clean_data(filenames, newname, name="cleandata_small"):
    """Copy a chosen subset of news/abstract pairs into a fresh dataset.

    Any existing ``<Dir.res>/<newname>/`` and ``<Dir.res>/result/<newname>/``
    directories are removed first. Then every entry of ``filenames`` is copied
    from ``<Dir.res>/<name>/news/`` and ``<Dir.res>/<name>/abstract/`` to the
    folders of the same name under ``<Dir.res>/<newname>/``.

    Args:
        filenames: Names of the files to copy. Must support ``len()``, which
            is used for the progress output.
        newname: Name of the destination dataset directory.
        name: Name of the source dataset directory.

    Raises:
        ValueError: If ``newname`` equals ``name``. The destination is wiped
            before copying, so identical names would delete the source data.
    """
    import shutil

    # Guard first: the rmtree calls below would otherwise erase the very
    # files this function is about to read.
    if newname == name:
        raise ValueError(
            "newname must differ from name: the destination directory is "
            "deleted before copying, which would destroy the source dataset"
        )

    src_news = Dir.res + "/" + name + "/news/"
    src_abstract = Dir.res + "/" + name + "/abstract/"

    # Start from a clean slate: drop stale data and stale results.
    for stale_dir in (
        Dir.res + "/" + newname + "/",
        Dir.res + "/result/" + newname + "/",
    ):
        if ftools.isexists(stale_dir):
            shutil.rmtree(stale_dir)

    dst_news = Dir.res + "/" + newname + "/news/"
    dst_abstract = Dir.res + "/" + newname + "/abstract/"

    total = len(filenames)
    # A distinct loop name keeps the ``name`` parameter from being shadowed.
    for count, fname in enumerate(filenames, start=1):
        print(count, total)
        ftools.copy(src_news + fname, dst_news + fname)
        ftools.copy(src_abstract + fname, dst_abstract + fname)
def generate_new_data():
    """Rebuild the abstracts of ``cleandata_highquality_3500`` with ``Uper``.

    For every file in the dataset's ``news/`` folder, the article lines are
    read and passed to ``Uper.summarize`` (``num=3``, ``fname`` set to the
    file name minus its last four characters). The article is then copied to
    ``cleandata_highquality_3500_new/news/`` and the returned summary is
    written to ``cleandata_highquality_3500_new/abstract/`` under the same
    file name.
    """
    source_root = Dir.res + "/cleandata_highquality_3500/news/"
    target_root = Dir.res + "/cleandata_highquality_3500_new/"
    target_news = target_root + "news/"
    target_abstract = target_root + "abstract/"

    summarizer = Uper()
    for filename in ftools.get_files(source_root):
        article_path = source_root + filename
        lines = ftools.read_lines(article_path)
        # name[:-4] drops the last four characters (presumably ".txt").
        summary = summarizer.summarize(lines, num=3, fname=filename[:-4])
        ftools.copy(article_path, target_news + filename)
        ftools.write_list(target_abstract + filename, summary)
def get_small_data():
    """Copy the short articles of ``cleandata_8700`` into ``cleandata_small``.

    An article is kept when its news file has fewer than 80 lines. For each
    kept article, both the news file and the abstract file of the same name
    are copied, and a progress line ``<copied so far> <index> <total>`` is
    printed before the copy.
    """
    src_news = Dir.res + "/cleandata_8700/" + "news/"
    src_abstract = Dir.res + "/cleandata_8700/" + "abstract/"
    dst_news = Dir.res + "/cleandata_small/" + "news/"
    dst_abstract = Dir.res + "/cleandata_small/" + "abstract/"

    candidates = ftools.get_files(src_news)
    total = len(candidates)
    copied = 0
    for index, filename in enumerate(candidates):
        if len(ftools.read_lines(src_news + filename)) >= 80:
            continue
        # NOTE(review): ``copied`` is assumed to count only the files that
        # were actually copied -- confirm against the original indentation.
        print(copied, index, total)
        ftools.copy(src_news + filename, dst_news + filename)
        ftools.copy(src_abstract + filename, dst_abstract + filename)
        copied += 1
def clean(path=Dir.res + "/extradata/", save=Dir.res + "/cleandata_1073/"):
    """Copy the records selected by ``filter`` from ``path`` into ``save``.

    ``filter(path)`` is called with a single argument, so it cannot be the
    builtin -- presumably a project-level function of the same name; confirm
    where it is defined. Each record it yields is kept when it lacks a ``0``
    or lacks a ``1``. The first element of a kept record is used as the file
    name, and that file is copied from ``path`` to ``save`` for both the
    ``news/`` and ``abstract/`` folders. ``training_288.txt`` is always
    skipped.

    Args:
        path: Source dataset root; expected to end with ``/``.
        save: Destination dataset root; expected to end with ``/``.
    """
    # Select everything first, then copy, so filtering finishes before any
    # file is touched.
    selected = [
        record for record in filter(path)
        if not (0 in record and 1 in record)
    ]

    for record in selected:
        filename = record[0]
        if filename == "training_288.txt":
            print("skip------------------")
            continue
        print(filename)
        ftools.copy(path + "news/" + filename, save + "news/" + filename)
        ftools.copy(
            path + "abstract/" + filename, save + "abstract/" + filename
        )