예제 #1
0
def get_clean_data(filenames, newname, name="cleandata_small"):
    """Copy selected news/abstract file pairs into a fresh dataset directory.

    Any existing ``Dir.res/<newname>/`` and ``Dir.res/result/<newname>/``
    directories are deleted first. Then every entry of ``filenames`` is
    copied from the ``news/`` and ``abstract/`` folders of dataset ``name``
    into the same-named folders of dataset ``newname``. Progress is printed
    as ``<index> <total>`` for each file.

    Args:
        filenames: Sized collection of file names present in both the
            ``news/`` and ``abstract/`` folders of the source dataset.
        newname: Name of the destination dataset directory under ``Dir.res``.
        name: Name of the source dataset directory under ``Dir.res``.
    """
    import shutil

    src_news = Dir.res + "/" + name + "/news/"
    src_abstract = Dir.res + "/" + name + "/abstract/"

    # Remove leftovers of a previous run so the destination only ever
    # contains the files selected by this call.
    for stale_dir in (Dir.res + "/" + newname + "/",
                      Dir.res + "/result/" + newname + "/"):
        if ftools.isexists(stale_dir):
            shutil.rmtree(stale_dir)

    dst_news = Dir.res + "/" + newname + "/news/"
    dst_abstract = Dir.res + "/" + newname + "/abstract/"

    # A dedicated loop variable keeps the ``name`` parameter from being
    # shadowed inside the loop.
    for count, filename in enumerate(filenames, start=1):
        print(count, len(filenames))
        ftools.copy(src_news + filename, dst_news + filename)
        ftools.copy(src_abstract + filename, dst_abstract + filename)
예제 #2
0
def generate_new_data():
    """Create the ``cleandata_highquality_3500_new`` dataset.

    For each file listed in ``cleandata_highquality_3500/news/`` the news
    lines are read and handed to ``Uper.summarize`` (``num=3``, ``fname`` set
    to the file name minus its last four characters). The news file is then
    copied unchanged into the new dataset's ``news/`` folder and the
    summarizer's result is written under the same file name into its
    ``abstract/`` folder.
    """
    source_news_dir = Dir.res + "/cleandata_highquality_3500/news/"
    target_news_dir = Dir.res + "/cleandata_highquality_3500_new/news/"
    target_abstract_dir = Dir.res + "/cleandata_highquality_3500_new/abstract/"

    summarizer = Uper()

    for filename in ftools.get_files(source_news_dir):
        source_file = source_news_dir + filename
        news_lines = ftools.read_lines(source_file)
        # name[:-4] drops the trailing four characters (presumably ".txt").
        generated = summarizer.summarize(
            news_lines, num=3, fname=filename[:-4])
        ftools.copy(source_file, target_news_dir + filename)
        ftools.write_list(target_abstract_dir + filename, generated)
예제 #3
0
def get_small_data():
    """Copy the short samples of ``cleandata_8700`` into ``cleandata_small``.

    A sample qualifies when its news file has fewer than 80 lines. For each
    qualifying sample the news file and the same-named abstract file are
    copied, and a progress line ``<copied so far> <index> <total>`` is
    printed before the copy.
    """
    source_root = Dir.res + "/cleandata_8700/"
    target_root = Dir.res + "/cleandata_small/"

    news_files = ftools.get_files(source_root + "news/")
    total = len(news_files)
    copied = 0
    for index, filename in enumerate(news_files):
        news_lines = ftools.read_lines(source_root + "news/" + filename)
        if len(news_lines) >= 80:
            # Too long for the small dataset; leave it out.
            continue
        print(copied, index, total)
        ftools.copy(source_root + "news/" + filename,
                    target_root + "news/" + filename)
        ftools.copy(source_root + "abstract/" + filename,
                    target_root + "abstract/" + filename)
        copied += 1
예제 #4
0
def clean(path=Dir.res + "/extradata/", save=Dir.res + "/cleandata_1073/"):
    """Copy the samples selected by ``filter(path)`` into ``save``.

    Every record returned by ``filter(path)`` is kept unless it contains both
    ``0`` and ``1``. For each kept record, its first element is used as the
    file name; the matching files under ``path + "news/"`` and
    ``path + "abstract/"`` are copied to the same sub-folders of ``save``.
    The file ``training_288.txt`` is always skipped.

    Args:
        path: Source dataset root (expected to end with a path separator).
        save: Destination dataset root (expected to end with a path separator).
    """
    # NOTE(review): ``filter`` is called with a single argument, so it is
    # presumably a helper defined elsewhere in this file that shadows the
    # builtin -- confirm.
    selected = [record for record in filter(path)
                if not (0 in record and 1 in record)]

    for record in selected:
        filename = record[0]
        if filename == "training_288.txt":
            print("skip------------------")
            continue
        print(filename)
        news_target = save + "news/" + filename
        abstract_target = save + "abstract/" + filename
        ftools.copy(path + "news/" + filename, news_target)
        ftools.copy(path + "abstract/" + filename, abstract_target)