def format_to_lines(args):
    print("in format lines")
    train_files = glob.glob(pjoin(args.train_path, './*.json'))
    valid_files = glob.glob(pjoin(args.valid_path, './*.json'))
    test_files = glob.glob(pjoin(args.test_path, './*.json'))
    print("test_files are ", test_files)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # if (len(dataset) > args.shard_size):
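            # NOTE: with the shard-size check above commented out, the whole accumulated
            # dataset is re-serialized to the same file on every iteration.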
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                print("saved to", pt_file)
        pool.close()
        pool.join()
Example #2
def format_to_lines(args):
    corpus_mapping = {}
    train_files = []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        train_files.append(f)

    corpora = {'train': train_files}
    for corpus_type in ['train']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
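            # flush a shard to disk once more than shard_size documents have accumulated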
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #3
def _files_dict_to_json(args, train_files, valid_files, test_files):
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #4
    def parse_files(self, workers, data_dicts=None):
        """Parse all files"""
        print("\n\n### Parsing files ###")
        os.chdir(self.workdir)  # questionable

        if data_dicts is None:
            data_dicts = [{"filename": fn.name} for fn in os.scandir(self.textdir)]
        filequeue = [
            {
                "name": d["filename"],
                "size": os.path.getsize(self.textdir + d["filename"]),
                "id": n + 1,
                "options": d["options"] if "options" in d else {},
                "newpath": self.textdir + d["filename"],
                "raw": self.workdir + d["filename"] + ".raw",
                "words": self.workdir + d["filename"] + ".words.sorted",
                "toms": self.workdir + d["filename"] + ".toms",
                "sortedtoms": self.workdir + d["filename"] + ".toms.sorted",
                "pages": self.workdir + d["filename"] + ".pages",
                "refs": self.workdir + d["filename"] + ".refs",
                "graphics": self.workdir + d["filename"] + ".graphics",
                "lines": self.workdir + d["filename"] + ".lines",
                "results": self.workdir + d["filename"] + ".results",
            }
            for n, d in enumerate(data_dicts)
        ]

        self.raw_files = [f["raw"] + ".lz4" for f in filequeue]

        self.metadata_hierarchy.append([])
        # Adding in doc level metadata
        for d in data_dicts:
            for k in list(d.keys()):
                if k not in self.metadata_fields:
                    self.metadata_fields.append(k)
                    self.metadata_hierarchy[0].append(k)
                if k not in self.metadata_types:
                    self.metadata_types[k] = "doc"
                    # don't need to check for conflicts, since doc is first.

        # Adding non-doc level metadata
        for element_type in self.parser_config["metadata_to_parse"]:
            if element_type != "page" and element_type != "ref" and element_type != "line":
                self.metadata_hierarchy.append([])
                for param in self.parser_config["metadata_to_parse"][element_type]:
                    if param not in self.metadata_fields:
                        self.metadata_fields.append(param)
                        self.metadata_hierarchy[-1].append(param)
                    if param not in self.metadata_types:
                        self.metadata_types[param] = element_type
                    else:  # we have a serious error here!  Should raise going forward.
                        pass

        print("%s: parsing %d files." % (time.ctime(), len(filequeue)))
        pool = Pool(workers)
        for results in pool.imap_unordered(self.__parse_file, zip(filequeue, data_dicts)):
            with open(results, "rb") as proc_fh:
                vec = pickle.load(proc_fh)  # load in the results from the child's parsework() function.
            self.omax = [max(x, y) for x, y in zip(vec, self.omax)]
        print("%s: done parsing" % time.ctime())
Example #5
def format_to_lines(args, raw_path, save_path):
    # file names: '4fd2a00e8eb7c8105d883bd7.json'
    name_list = os.listdir(raw_path)
    a_lst = [(pjoin(raw_path, f), args) for f in name_list]
    pool = Pool(args.n_cpus)
    dataset = []
    p_ct = 0
    for d in pool.imap_unordered(_format_to_lines, a_lst):
        dataset.append(d)
        if (len(dataset) > args.shard_size):
            pt_file = "{:s}.{:d}.json".format(save_path, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

    pool.close()
    pool.join()
    if (len(dataset) > 0):
        pt_file = os.path.join(save_path, 'test.json')
        with open(pt_file, 'w') as save:
            save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
Example #6
def format_to_nnsum(args, split_ratio=[0.8, 0.1, 0.1]):
    ''' Convert data into the format that nnsum (https://github.com/kedz/nnsum)
        expects for training SummaRunner and other baseline models.
    label_file: {id}.json
            {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
            "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
            # the nnsum paper uses each sentence's tokenized words joined by spaces,
            with the original casing preserved (both upper and lower case appear)
    input_file: {id}.json
            {"input": [sent_1, sent_2, ..., sent_n], "id":story_id}
            sent_i: {"text":original text, "tokens":word list, "pos":postag, "ne":NER,
                    "word_count":word count of sent_i, "sentence_id":i}
            # sentence_id starts from 1
            # the fields actually used by the model are:
                "tokens", "text"
    '''
    output_dir = os.path.dirname(args.save_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    file_list = os.listdir(args.raw_path)
    file_list.sort(
        key=lambda f: (datetime.strptime(f.rsplit("_", 1)[0], '%Y_%m_%d'),
                       int(f.rsplit("_", 1)[1].split(".")[0])))
    file_list = ["%s/%s" % (args.raw_path, f) for f in file_list]
    #print(file_list)
    train_count, valid_count, test_count = [
        round(len(file_list) * x) for x in split_ratio
    ]
    print(train_count, valid_count, test_count)

    train_files = file_list[:train_count]
    valid_files = file_list[train_count:train_count + valid_count]
    test_files = file_list[train_count + valid_count:]

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)

        num_stories = len(a_lst)
        # the entries in a_lst are distributed to the pool's worker processes; results arrive in arbitrary order
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                   num_stories),
                  end="\r" if idx < num_stories else "\n",
                  flush=True)

        pool.close()
        pool.join()
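
# For reference, a minimal sketch (using a hypothetical, already-built `story` dict) of how a
# single story would be written in the three-file layout described in the docstring above;
# in the real code the per-file work is done by the _format_to_nnsum worker:
import json, os

def write_nnsum_story(story, input_dir, label_dir, abstracts_dir):
    sid = story["id"]
    # input file: {id}.json with per-sentence dicts ("tokens" and "text" are the fields nnsum uses)
    with open(os.path.join(input_dir, sid + ".json"), "w") as f:
        json.dump({"id": sid, "input": story["sentences"]}, f)
    # label file: {id}.json with one binary extractive label per sentence
    with open(os.path.join(label_dir, sid + ".json"), "w") as f:
        json.dump({"id": sid, "labels": story["labels"]}, f)
    # abstract file: {id}.spl, one tokenized reference sentence per line
    with open(os.path.join(abstracts_dir, sid + ".spl"), "w") as f:
        f.write("\n".join(story["abstract_sentences"]))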
Example #7
def format_to_nnsum(args):
    ''' Convert data into the format that nnsum (https://github.com/kedz/nnsum)
        expects for training SummaRunner and other baseline models.
    label_file: {id}.json
            {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
            "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
            # the nnsum paper uses each sentence's tokenized words joined by spaces,
            with the original casing preserved (both upper and lower case appear)
    input_file: {id}.json
            {"input": [sent_1, sent_2, ..., sent_n], "id":story_id}
            sent_i: {"text":original text, "tokens":word list, "pos":postag, "ne":NER,
                    "word_count":word count of sent_i, "sentence_id":i}
            # sentence_id starts from 1
            # the fields actually used by the model are:
                "tokens", "text"
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)

        num_stories = len(a_lst)
        # the entries in a_lst are distributed to the pool's worker processes; results arrive in arbitrary order
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                   num_stories),
                  end="\r" if idx < num_stories else "\n",
                  flush=True)

        pool.close()
        pool.join()
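
# Several of these examples call hashhex without showing its definition; it is conventionally the
# SHA-1 helper from the CNN/DailyMail preprocessing scripts. A sketch, in case it is not in scope:
import hashlib

def hashhex(s):
    """Return the hexadecimal SHA-1 digest of the input string."""
    h = hashlib.sha1()
    h.update(s.encode('utf-8'))
    return h.hexdigest()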
Example #8
def format_to_lines(args):
    # load mapping files
    print('| Loading mapping files ...')
    corpus_mapping = {"train": [], "valid": [], "test": []}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        mapping_fp = os.path.join(args.map_path,
                                  "mapping_{}.txt".format(corpus_type))
        if not os.path.exists(mapping_fp):
            print(
                "Mapping file '{}' does not exist. Skipping this split."
                .format(mapping_fp))
            continue
        for line in open(mapping_fp):
            temp.append(hashhex(line.strip()))
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    # load corresponding tokenized json files
    print('| Loading tokenized json files ...')
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(os.path.join(args.raw_path, '*.json')):
        real_name = os.path.splitext(os.path.basename(f))[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    # convert to target lines json file
    print('| Converting to line-based json files ...')
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = os.path.join(args.save_path,
                                       "{}.{}.json".format(corpus_type, p_ct))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = os.path.join(args.save_path,
                                   "{}.{}.json".format(corpus_type, p_ct))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

    print('| Finished formatting to line-based json files!')
Example #9
def format_to_lines(args):
    if not os.path.isdir(args.map_path):
        os.makedirs(args.map_path)
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    data_splitter = SplitRawFiles(args.raw_path, args.map_path)
    data_splitter.get_and_split_filenames()
    data_splitter.save_fnames_to_corresponding_files()
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            # temp.append(hashhex(line.strip()))
            temp.append(line)
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    i=0
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        # real_name = hashhex(f.split('/')[-1].split('.')[0])
        real_name = f.split('/')[-1]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        i+=1
        # if i > 100:
        #     break
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    # import ipdb; ipdb.set_trace()
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                # import ipdb; ipdb.set_trace()
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #10
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        if args.map_on and args.map_path != 'empty':
            for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
                temp.append(hashhex(line.strip()))
            corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
        else:
            tr, va, te = manual_corp_assign(args)
            corpus_mapping['valid'] = va
            corpus_mapping['test'] = te
            corpus_mapping['train'] = tr
    train_files, valid_files, test_files = [], [], []
    # path = glob.glob(pjoin(args.raw_path, '*.json')) # sh added
    # if len(path) < 1:
    #   path = glob.glob(pjoin(os.getcwd() + '\\' + args.raw_path, '*.json'))
    #   print(os.getcwd() + '\\' + args.raw_path)
    for f in glob.glob(pjoin(args.raw_path, '*.json')): # sh changed
        if args.map_on and args.map_path != 'empty':
            real_name = f.split('\\')[-1].split('.')[0]  # SH changed from: real_name = f.split('/')[-1].split('.')[0]
        else:
            real_name = f
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #11
def format_to_lines(args):
    # corpus_mapping = {}
    # for corpus_type in ['valid', 'test', 'train']:
    #     temp = []
    #     for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
    #         temp.append(hashhex(line.strip()))
    #     corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []

    # Randomly split the dataset: train:valid:test = 8:1:1
    import random
    random.seed(1)
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        # if (real_name in corpus_mapping['valid']):
        #     valid_files.append(f)
        # elif (real_name in corpus_mapping['test']):
        #     test_files.append(f)
        # elif (real_name in corpus_mapping['train']):
        #     train_files.append(f)
        n = random.random()
        if n <= 0.1:
            valid_files.append(f)
        elif n <= 0.2:
            test_files.append(f)
        else:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #12
def format_to_lines_tfds(args):
    """ Formats source text and target text as pt file. """

    tokenized_sub_dirs = os.listdir(args.raw_path)
    dataset_name = os.path.dirname(args.save_path).split('/')[-1]

    # Make directory
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    # Create file list for each split directory
    corpora = {}
    for tokenized_sub_dir in tokenized_sub_dirs:
        path = pjoin(args.raw_path, tokenized_sub_dir)
        files = []
        for f in glob.glob(pjoin(path, '*.json')):
            files.append(f)
        corpora[tokenized_sub_dir] = files
        files = []

    for corpus_type in tokenized_sub_dirs:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # NOTE: save files according to shard_size
            if (len(dataset) >= args.shard_size):
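                # TFDS names its dev split "validation"; map it to "valid" to match the other corpora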
                if (corpus_type == 'validation'):
                    type_name = 'valid'
                else:
                    type_name = corpus_type
                pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name,
                                                       p_ct)
                with open(pjoin(args.save_path, pt_file), 'w') as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []
        pool.close()
        pool.join()

        # For the last few data (< shard size)
        if (len(dataset) > 0):
            if (corpus_type == 'validation'):
                type_name = 'valid'
            else:
                type_name = corpus_type
            pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name,
                                                   p_ct)
            with open(pjoin(args.save_path, pt_file), 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #13
def main():
    with open('char.json') as f:
        d = json.loads(f.read())

    result = {}

    pool = Pool(4)

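    # `task` (defined elsewhere) is expected to return a (key, value) pair for each character key;
    # imap_unordered yields those pairs in whatever order the workers finish them.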
    for k, d in tqdm(pool.imap_unordered(task, d.keys())):
        result[k] = d

    with open('wubi.json', 'w') as f:
        f.write(json.dumps(result))
Example #14
def format_to_lines(args):
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    '''
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        #real_name = f.split('/')[-1].split('.')[0]
        f = str(f)
        #
        if (f[97] == "a"):
            valid_files.append(f)
        elif (f[97] == "e"):
            test_files.append(f)
        elif (f[97] == "r"):
            train_files.append(f)
        # else:
        #     train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                #pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                pt_file = args.save_path + corpus_type + str(p_ct) + ".json"
                #print(pt_file)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = args.save_path + corpus_type + str(p_ct) + ".json"
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #15
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    logger.info("txt read finished")
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:
            logger.info("not in mapping file: %s", f)
            train_files.append(f)
    logger.info("data split over")
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #16
def format_json(args):
    corpus_mapping = {}
    type_list = ['valid', 'test', 'train']

    for corpus_type in type_list:
        temp = []
        for line in open(join(args.map_path,
                              'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(join(args.tokenized_path, '*.json')):
        real_name = os.path.basename(f).split('.')[0]
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in type_list:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        data_set = []
        p_ct = 0
        for d in pool.imap_unordered(_format_json, a_lst):
            data_set.append(d)
            if len(data_set) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(args.json_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(data_set))
                    p_ct += 1
                    data_set = []
                    print("saving ...")

        pool.close()
        pool.join()

        if len(data_set) > 0:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(data_set))
                p_ct += 1
Example #17
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        print("Real name is:", real_name)
#         print("this needs to equal:")
#         print("corpus mapping of valid/test/train:", corpus_mapping['valid'])
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:                                   # Bryan edit this out later (original)
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #18
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ["valid", "test", "train"]:
        temp = []
        for line in open(
                pjoin(args.map_path, "mapping_" + corpus_type + ".txt")):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, "*.json")):
        real_name = f.split("/")[-1].split(".")[0]
        if real_name in corpus_mapping["valid"]:
            valid_files.append(f)
        elif real_name in corpus_mapping["test"]:
            test_files.append(f)
        elif real_name in corpus_mapping["train"]:
            train_files.append(f)
        # else:
        #     train_files.append(f)

    corpora = {"train": train_files, "valid": valid_files, "test": test_files}
    for corpus_type in ["train", "valid", "test"]:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if len(dataset) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, "w") as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if len(dataset) > 0:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, "w") as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #19
def format_to_lines(args, split_ratio=[0.8, 0.1, 0.1]):
    output_dir = os.path.dirname(args.save_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    file_list = os.listdir(args.raw_path)
    file_list.sort(
        key=lambda f: (datetime.strptime(f.rsplit("_", 1)[0], '%Y_%m_%d'),
                       int(f.rsplit("_", 1)[1].split(".")[0])))
    file_list = ["%s/%s" % (args.raw_path, f) for f in file_list]
    #print(file_list)
    train_count, valid_count, test_count = [
        round(len(file_list) * x) for x in split_ratio
    ]
    print(train_count, valid_count, test_count)

    train_files = file_list[:train_count]
    valid_files = file_list[train_count:train_count + valid_count]
    test_files = file_list[train_count + valid_count:]

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            # the entries in a_lst are distributed to the pool's worker processes; results arrive in arbitrary order
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #20
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            # the entries in a_lst are distributed to the pool's worker processes; results arrive in arbitrary order
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #21
    def get_text(self):
        if self.filter:
            print("Loading words to keep file...")
            with open(self.words_to_keep_path) as input_file:
                for line in input_file:
                    word = line.strip()
                    self.words_to_keep.add(word)
        print("Parsing text body of all files...", flush=True)
        pool = Pool(self.workers)
        chunksize = len(self.files) // self.workers // 10
        with tqdm(total=len(self.files), leave=self.debug) as pbar:
            for _ in pool.imap_unordered(self.parse_file,
                                         self.files,
                                         chunksize=chunksize or 1):
                pbar.update()
        pool.close()
        pool.join()
Example #22
def format_to_lines(args):
    train_files, valid_files, test_files, new_files = [], [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        fn = f.split('/')[-1].split('.')[0]
        if fn.startswith('train'):
            train_files.append(f)
        elif fn.startswith('valid'):
            valid_files.append(f)
        elif fn.startswith('test'):
            test_files.append(f)
        elif fn.startswith('new'):
            new_files.append(f)

    corpora = {
        'train': train_files,
        'valid': valid_files,
        'test': test_files,
        'new': new_files
    }
    for corpus_type in ['train', 'valid', 'test', 'new']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #23
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    cur = 0
    valid_test_ratio = 0.01
    all_size = len(glob.glob(pjoin(args.raw_path, '*.json')))
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        if cur < valid_test_ratio * all_size:
            valid_files.append(f)
        elif cur < valid_test_ratio * 2 * all_size:
            test_files.append(f)
        else:
            train_files.append(f)
        cur += 1

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #24
def imap_progress(f, args, threads = 1,
                  overwrite = True, overwrite_last = True, return_output = True,
                  msg = lambda curr, last: f"{curr}/{last} done.", lvl = 0, quiet = False):
    printi = make_local_print(quiet = False, printf = make_print_preindent(lvl + 1))
    pool = Pool(threads)
    total = len(args)
    output = []
    def print_update(curr):
        if overwrite_last or (overwrite and curr < total):
            printi(msg(curr, total), overwrite = True)
        else:
            printi(msg(curr, total), overwrite = False)
    print_update(0)
    for i, result in enumerate(pool.imap_unordered(f, args), 1):
        output.append(result)
        print_update(i)
    pool.close()
    return output if return_output else None
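
# Usage sketch for imap_progress (the worker `square` and the argument list are illustrative only;
# any picklable one-argument function works):
def square(x):
    return x * x

results = imap_progress(square, list(range(100)), threads=4)  # prints "i/100 done." as results arrive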
Example #25
def format_xsum_to_lines(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'test', 'valid']

    if args.data_split_json_is_full_path:
        corpus_mapping = json.load(open(args.data_split_json))
    else:
        corpus_mapping = json.load(
            open(pjoin(args.raw_path, args.data_split_json)))

    for corpus_type in datasets:
        mapped_fnames = corpus_mapping[corpus_type]
        root_src = pjoin(args.raw_path, 'restbody')
        root_tgt = pjoin(args.raw_path, 'firstsentence')
        # realnames = [fname.split('.')[0] for fname in os.listdir(root_src)]
        realnames = mapped_fnames

        a_lst = [(root_src, root_tgt, n) for n in realnames]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_xsum_to_lines, a_lst):
            if (d is None):
                continue
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #26
def format_to_lines(args):
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(
                pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = os.path.splitext(os.path.splitext(
            os.path.basename(f))[0])[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                       corpus_type, p_ct)
                json.dump(dataset, open(pt_file, 'w'), indent=2)
                p_ct += 1
                dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)
            json.dump(dataset, open(pt_file, 'w'), indent=2)
            p_ct += 1
Example #27
def format_to_lines(args):

    corpus_mapping = {"valid": ["valid"], "train": ["train"], "test": ["test"]}
    corpus_type = ['valid', 'test', 'train']

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping["valid"]):
            valid_files.append(f)
        elif (real_name in corpus_mapping["test"]):
            test_files.append(f)
        elif (real_name in corpus_mapping["train"]):
            train_files.append(f)
        # else:
        #     train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    print(corpora)
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset += d

        pool.close()
        pool.join()

        div, mod = divmod(len(dataset), args.shard_size)

        n_iter = div if mod == 0 else div + 1

        for p_ct in tqdm.tqdm(range(n_iter), desc="Shard Iter: "):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type,
                                                   p_ct)

            with open(pt_file, 'w') as save:
                shard_size_dataset = dataset[:args.shard_size]
                save.write(json.dumps(shard_size_dataset, ensure_ascii=False))
                dataset = dataset[args.shard_size:]
Example #28
def test_multiprocess_function () :
    """Test parallel processnig with multiprocess
    """
    logger =    getLogger ("ostap.test_multiprocess_function")
    logger.info ('Test job submission with %s' %  multiprocess ) 
    
    if not dill :
        logger.error ( "dill is not available" )
        return
        
    if not multiprocess :
        logger.error ( "multiprocess is not available" )
        return 
        
    from ostap.core.known_issues import DILL_ROOT_issue
    if DILL_ROOT_issue : 
        logger.warning ( "test is disabled for this Python version (dill/ROOT issue)" )
        return
    
    ncpus = multiprocess.cpu_count() 
    
    from multiprocess import Pool
    
    pool = Pool  ( ncpus ) 
    
    jobs = pool.imap_unordered ( make_histos , zip ( count() ,  inputs ) )
    
    result = None 
    for h in progress_bar ( jobs , max_value = len ( inputs ) ) :
        if not result  : result = h
        else           : result.Add ( h )

    pool.close ()
    pool.join  ()
    
    logger.info ( "Histogram is %s" % result.dump ( 80 , 20 ) )
    logger.info ( "Entries  %s/%s" % ( result.GetEntries() , sum ( inputs ) ) ) 
    
    result.Draw (   ) 
    time.sleep  ( 2 )

    return result 
Example #29
def _df_to_jsons(df, prefix, to_dir, n_cpus, clients):

    NUM_DOCS_IN_ONE_FILE = 1000
    first_row_idx_list = list(range(0, len(df), NUM_DOCS_IN_ONE_FILE))
    digits_num = len(str(len(df)))
    client_list = clients * ((len(first_row_idx_list) // len(clients)) + 1)

    df_list = []
    file_name_list = []
    for i, first_row_idx in enumerate(first_row_idx_list):

        if i == len(first_row_idx_list) - 1:  # last element
            last_row_idx = len(df)
            df_list.append(df[first_row_idx:])
        else:
            last_row_idx = first_row_idx + NUM_DOCS_IN_ONE_FILE
            df_list.append(df[first_row_idx:last_row_idx])

        ## zero-pad the row indices in the file name so the files sort correctly
        start_row_idx_str = (digits_num - len(str(first_row_idx))) * '0' + str(
            first_row_idx)
        last_row_idx_str = (digits_num - len(
            str(last_row_idx - 1))) * '0' + str(last_row_idx - 1)
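        # (equivalent to str(first_row_idx).zfill(digits_num) and str(last_row_idx - 1).zfill(digits_num))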
        file_name = f'{to_dir}/{prefix}_{start_row_idx_str}_{last_row_idx_str}.json'
        file_name_list.append(file_name)

    print(
        f"----------{prefix} start({len(first_row_idx_list)} files)------------"
    )
    # with Pool(processes=n_cpus) as pool:
    pool = Pool(n_cpus)
    port_idx = list(range(n_cpus))
    for result in pool.imap_unordered(
            _df_to_json, zip(df_list, file_name_list,
                             client_list)):  # zip stops at the shortest of these iterables
        print(result)
    pool.close()
    pool.join()
    # for params in tqdm(zip(dfs_split, idx_list, file_name_list)):
    #     _df_to_json(params)
    print(
        f"----------{prefix} end({len(first_row_idx_list)} files)------------")
Example #30
def format_xsum_to_lines(args):
    if args.dataset != "":
        datasets = [args.dataset]
    else:
        datasets = ["train", "test", "valid"]

    corpus_mapping = json.load(
        open(pjoin(args.raw_path, "XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"))
    )

    for corpus_type in datasets:
        mapped_fnames = corpus_mapping[corpus_type]
        root_src = pjoin(args.raw_path, "restbody")
        root_tgt = pjoin(args.raw_path, "firstsentence")
        # realnames = [fname.split('.')[0] for fname in os.listdir(root_src)]
        realnames = mapped_fnames

        a_lst = [(root_src, root_tgt, n) for n in realnames]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_xsum_to_lines, a_lst):
            if d is None:
                continue
            dataset.append(d)
            if len(dataset) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(
                    args.save_path, corpus_type, p_ct
                )
                with open(pt_file, "w") as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        if len(dataset) > 0:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, "w") as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
Example #31
def test():
    print('cpuCount() = %d\n' % cpuCount())
    
    #
    # Create pool
    #
    
    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)    

    #
    # Tests
    #

    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)

    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()

    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()

    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()

    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()

    #
    # Simple benchmarks
    #

    N = 100000
    print('def pow3(x): return x**3')
    
    t = time.time()
    A = list(map(pow3, range(N)))
    print('\tmap(pow3, range(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    B = pool.map(pow3, range(N))
    print('\tpool.map(pow3, range(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    C = list(pool.imap(pow3, range(N), chunksize=N//8))
    print('\tlist(pool.imap(pow3, range(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t))
    
    assert A == B == C, (len(A), len(B), len(C))
    print()
    
    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')
    
    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))
    
    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    del A, B, C, L

    #
    # Test error handling
    #

    print('Testing error handling:')

    try:
        print(pool.apply(f, (5,)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')
            
    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')
            
    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()
    
    #
    # Testing timeouts
    #
    
    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
            
    #
    # Testing callback
    #

    print('Testing callback:')
    
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
        
    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))
    
    #
    # Check there are no outstanding tasks
    #
    
    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print('Testing close():')

    for worker in pool._pool:
        assert worker.is_alive()

    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tclose() succeeded\n')

    #
    # Check terminate() method
    #

    print('Testing terminate():')

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tterminate() succeeded\n')

    #
    # Check garbage collection
    #

    print('Testing garbage collection:')

    pool = Pool(2)
    processes = pool._pool
    
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)
    
    for worker in processes:
        assert not worker.is_alive()

    print('\tgarbage collection succeeded\n')