def format_to_lines(args): print("in format lines") train_files = glob.glob(pjoin(args.train_path, './*.json')) valid_files = glob.glob(pjoin(args.valid_path, './*.json')) test_files = glob.glob(pjoin(args.test_path, './*.json')) print("test_files are ", test_files) corpora = {'train': train_files, 'valid': valid_files, 'test': test_files} for corpus_type in ['train', 'valid', 'test']: a_lst = [(f, args) for f in corpora[corpus_type]] pool = Pool(args.n_cpus) dataset = [] p_ct = 0 for d in pool.imap_unordered(_format_to_lines, a_lst): dataset.append(d) # if (len(dataset) > args.shard_size): pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct) with open(pt_file, 'w') as save: save.write(json.dumps(dataset)) print( "saved to ", "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)) pool.close() pool.join()
def format_to_lines(args):
    corpus_mapping = {}
    train_files = []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        train_files.append(f)
    corpora = {'train': train_files}
    for corpus_type in ['train']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def _files_dict_to_json(args, train_files, valid_files, test_files):
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

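# The functions above all map `_format_to_lines` over (filename, args) tuples
# but never show that worker. A minimal sketch of what such a worker typically
# looks like, assuming PreSumm-style tokenized-story JSON with source and
# target sentences; `load_json` is a hypothetical helper standing in for
# whatever loader the project actually uses.
def _format_to_lines(params):
    f, args = params
    # read one tokenized document and return it as a plain dict so the parent
    # process can accumulate the results into shards
    source, tgt = load_json(f, args.lower)
    return {'src': source, 'tgt': tgt}
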
def parse_files(self, workers, data_dicts=None):
    """Parse all files"""
    print("\n\n### Parsing files ###")
    os.chdir(self.workdir)  # questionable
    if data_dicts is None:
        data_dicts = [{"filename": fn.name} for fn in os.scandir(self.textdir)]
    filequeue = [
        {
            "name": d["filename"],
            "size": os.path.getsize(self.textdir + d["filename"]),
            "id": n + 1,
            "options": d["options"] if "options" in d else {},
            "newpath": self.textdir + d["filename"],
            "raw": self.workdir + d["filename"] + ".raw",
            "words": self.workdir + d["filename"] + ".words.sorted",
            "toms": self.workdir + d["filename"] + ".toms",
            "sortedtoms": self.workdir + d["filename"] + ".toms.sorted",
            "pages": self.workdir + d["filename"] + ".pages",
            "refs": self.workdir + d["filename"] + ".refs",
            "graphics": self.workdir + d["filename"] + ".graphics",
            "lines": self.workdir + d["filename"] + ".lines",
            "results": self.workdir + d["filename"] + ".results",
        }
        for n, d in enumerate(data_dicts)
    ]
    self.raw_files = [f["raw"] + ".lz4" for f in filequeue]
    self.metadata_hierarchy.append([])
    # Adding in doc level metadata
    for d in data_dicts:
        for k in list(d.keys()):
            if k not in self.metadata_fields:
                self.metadata_fields.append(k)
                self.metadata_hierarchy[0].append(k)
            if k not in self.metadata_types:
                self.metadata_types[k] = "doc"
                # don't need to check for conflicts, since doc is first.
    # Adding non-doc level metadata
    for element_type in self.parser_config["metadata_to_parse"]:
        if element_type != "page" and element_type != "ref" and element_type != "line":
            self.metadata_hierarchy.append([])
            for param in self.parser_config["metadata_to_parse"][element_type]:
                if param not in self.metadata_fields:
                    self.metadata_fields.append(param)
                    self.metadata_hierarchy[-1].append(param)
                if param not in self.metadata_types:
                    self.metadata_types[param] = element_type
                else:
                    # we have a serious error here! Should raise going forward.
                    pass
    print("%s: parsing %d files." % (time.ctime(), len(filequeue)))
    pool = Pool(workers)
    for results in pool.imap_unordered(self.__parse_file, zip(filequeue, data_dicts)):
        with open(results, "rb") as proc_fh:
            # load in the results from the child's parsework() function.
            vec = pickle.load(proc_fh)
        self.omax = [max(x, y) for x, y in zip(vec, self.omax)]
    print("%s: done parsing" % time.ctime())

def format_to_lines(args, raw_path, save_path):
    # file names: '4fd2a00e8eb7c8105d883bd7.json'
    name_list = os.listdir(raw_path)
    a_lst = [(pjoin(raw_path, f), args) for f in name_list]
    pool = Pool(args.n_cpus)
    dataset = []
    p_ct = 0
    for d in pool.imap_unordered(_format_to_lines, a_lst):
        dataset.append(d)
        if (len(dataset) > args.shard_size):
            pt_file = "{:s}.{:d}.json".format(save_path, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
    pool.close()
    pool.join()
    if (len(dataset) > 0):
        pt_file = os.path.join(save_path, 'test.json')
        with open(pt_file, 'w') as save:
            save.write(json.dumps(dataset))
        p_ct += 1
        dataset = []

def format_to_nnsum(args, split_ratio=[0.8, 0.1, 0.1]):
    '''
    convert data to what nnsum (https://github.com/kedz/nnsum) can use for
    training SummaRunner and other baseline models.
    label_file: {id}.json
        {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
         "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
        # nnsum paper uses tokenized words joined by space as each sentence,
        # but uncased (both upper and lower case included)
    input_file: {id}.json
        {"input": [sent_1, sent_2, ..., sent_n], "id": story_id}
        sent_i: {"text": original text, "tokens": word list, "pos": postag,
                 "ne": NER, "word_count": word count of sent_i, "sentence_id": i}
        # sentence_id is from 1
        # The fields really used in the model are: "tokens", "text"
    '''
    output_dir = os.path.dirname(args.save_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    file_list = os.listdir(args.raw_path)
    file_list.sort(
        key=lambda f: (datetime.strptime(f.rsplit("_", 1)[0], '%Y_%m_%d'),
                       int(f.rsplit("_", 1)[1].split(".")[0])))
    file_list = ["%s/%s" % (args.raw_path, f) for f in file_list]
    # print(file_list)
    train_count, valid_count, test_count = [
        round(len(file_list) * x) for x in split_ratio
    ]
    print(train_count, valid_count, test_count)
    train_files = file_list[:train_count]
    valid_files = file_list[train_count:train_count + valid_count]
    test_files = file_list[train_count + valid_count:]
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        # randomly assigned the entries in a_lst to different processors in the pool
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)
        num_stories = len(a_lst)
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx, num_stories),
                  end="\r" if idx < num_stories else "\n", flush=True)
        pool.close()
        pool.join()

def format_to_nnsum(args):
    '''
    convert data to what nnsum (https://github.com/kedz/nnsum) can use for
    training SummaRunner and other baseline models.
    label_file: {id}.json
        {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
         "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}
    abstract_file: {id}.spl
        # nnsum paper uses tokenized words joined by space as each sentence,
        # but uncased (both upper and lower case included)
    input_file: {id}.json
        {"input": [sent_1, sent_2, ..., sent_n], "id": story_id}
        sent_i: {"text": original text, "tokens": word list, "pos": postag,
                 "ne": NER, "word_count": word count of sent_i, "sentence_id": i}
        # sentence_id is from 1
        # The fields really used in the model are: "tokens", "text"
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        data_dir = pathlib.Path(args.save_path)
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        input_dir.mkdir(exist_ok=True, parents=True)  # similar to 'mkdir -p'
        label_dir.mkdir(exist_ok=True, parents=True)
        abstracts_dir.mkdir(exist_ok=True, parents=True)
        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        # randomly assigned the entries in a_lst to different processors in the pool
        result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)
        num_stories = len(a_lst)
        for idx, result in enumerate(result_iter, 1):
            print("{}: Writing story {}/{}".format(corpus_type, idx, num_stories),
                  end="\r" if idx < num_stories else "\n", flush=True)
        pool.close()
        pool.join()

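# Neither variant above shows `_format_to_nnsum`. Going only by the docstring,
# a worker of roughly this shape would be expected: one input json, one label
# json and one .spl abstract written per story. The field names follow the
# docstring; `read_story` and the return value are hypothetical placeholders.
def _format_to_nnsum(params):
    f, args, input_dir, abstracts_dir, label_dir = params
    # hypothetical loader: id, sentence dicts, 0/1 labels, abstract sentences
    story_id, sents, labels, abstract_lines = read_story(f, args)
    with open(input_dir / (story_id + ".json"), "w") as fh:
        json.dump({"id": story_id, "input": sents}, fh)
    with open(label_dir / (story_id + ".json"), "w") as fh:
        json.dump({"id": story_id, "labels": labels}, fh)
    with open(abstracts_dir / (story_id + ".spl"), "w") as fh:
        fh.write("\n".join(abstract_lines))
    return story_id
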
def format_to_lines(args):
    # load mapping files
    print('| Loading mapping files ...')
    corpus_mapping = {"train": [], "valid": [], "test": []}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        mapping_fp = os.path.join(args.map_path, "mapping_{}.txt".format(corpus_type))
        if not os.path.exists(mapping_fp):
            print("Mapping file '{}' doesn't exist. Skipping this split.".format(mapping_fp))
            continue
        for line in open(mapping_fp):
            temp.append(hashhex(line.strip()))
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    # load corresponding tokenized json files
    print('| Loading tokenized json files ...')
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(os.path.join(args.raw_path, '*.json')):
        # splitext returns a (root, ext) tuple; keep only the root for the lookup
        real_name = os.path.splitext(os.path.basename(f))[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
    # convert to target lines json file
    print('| Converting to line-based json files ...')
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = os.path.join(args.save_path, "{}.{}.json".format(corpus_type, p_ct))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = os.path.join(args.save_path, "{}.{}.json".format(corpus_type, p_ct))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
    print('| Finished formatting to line-based json files!')

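# Several of these functions key their split mapping on `hashhex`. In the
# standard CNN/DailyMail preprocessing scripts this is simply a SHA-1 hex
# digest of the story URL; a sketch along those lines (the exact helper in
# each repository may differ slightly):
import hashlib

def hashhex(s):
    """Return a hex digest of the string, used to match mapping-file entries."""
    h = hashlib.sha1()
    h.update(s.encode('utf-8'))
    return h.hexdigest()
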
def format_to_lines(args):
    if not os.path.isdir(args.map_path):
        os.makedirs(args.map_path)
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    data_splitter = SplitRawFiles(args.raw_path, args.map_path)
    data_splitter.get_and_split_filenames()
    data_splitter.save_fnames_to_corresponding_files()
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            # temp.append(hashhex(line.strip()))
            temp.append(line)
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    i = 0
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        # real_name = hashhex(f.split('/')[-1].split('.')[0])
        real_name = f.split('/')[-1]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        i += 1
        # if i > 100:
        #     break
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        if args.map_on and args.map_path != 'empty':
            for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
                temp.append(hashhex(line.strip()))
            corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
        else:
            tr, va, te = manual_corp_assign(args)
            corpus_mapping['valid'] = va
            corpus_mapping['test'] = te
            corpus_mapping['train'] = tr
    train_files, valid_files, test_files = [], [], []
    # path = glob.glob(pjoin(args.raw_path, '*.json'))  # sh: added
    # if len(path) < 1:
    #     path = glob.glob(pjoin(os.getcwd() + '\\' + args.raw_path, '*.json'))
    #     print(os.getcwd() + '\\' + args.raw_path)
    for f in glob.glob(pjoin(args.raw_path, '*.json')):  # sh: changed
        if args.map_on and args.map_path != 'empty':
            # sh: changed -- strip the directory regardless of path separator
            real_name = f.replace('\\', '/').split('/')[-1].split('.')[0]
        else:
            real_name = f
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_to_lines(args):
    # corpus_mapping = {}
    # for corpus_type in ['valid', 'test', 'train']:
    #     temp = []
    #     for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
    #         temp.append(hashhex(line.strip()))
    #     corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    # Randomly split the data set, train:valid:test = 8:1:1
    import random
    random.seed(1)
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        # if (real_name in corpus_mapping['valid']):
        #     valid_files.append(f)
        # elif (real_name in corpus_mapping['test']):
        #     test_files.append(f)
        # elif (real_name in corpus_mapping['train']):
        #     train_files.append(f)
        n = random.random()
        if n <= 0.1:
            valid_files.append(f)
        elif n <= 0.2:
            test_files.append(f)
        else:
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_to_lines_tfds(args):
    """
    Formats source text and target text as pt file.
    """
    tokenized_sub_dirs = os.listdir(args.raw_path)
    dataset_name = os.path.dirname(args.save_path).split('/')[-1]
    # Make directory
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    # Create file list for each split directory
    corpora = {}
    for tokenized_sub_dir in tokenized_sub_dirs:
        path = pjoin(args.raw_path, tokenized_sub_dir)
        files = []
        for f in glob.glob(pjoin(path, '*.json')):
            files.append(f)
        corpora[tokenized_sub_dir] = files
        files = []
    for corpus_type in tokenized_sub_dirs:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # NOTE: save files according to shard_size
            if (len(dataset) >= args.shard_size):
                if (corpus_type == 'validation'):
                    type_name = 'valid'
                else:
                    type_name = corpus_type
                pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name, p_ct)
                with open(pjoin(args.save_path, pt_file), 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # For the last few data (< shard size)
        if (len(dataset) > 0):
            if (corpus_type == 'validation'):
                type_name = 'valid'
            else:
                type_name = corpus_type
            pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name, p_ct)
            with open(pjoin(args.save_path, pt_file), 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def main():
    with open('char.json') as f:
        d = json.loads(f.read())
    result = {}
    pool = Pool(4)
    # iterate results as they complete; each worker returns a (key, value) pair
    for k, v in tqdm(pool.imap_unordered(task, d.keys())):
        result[k] = v
    pool.close()
    pool.join()
    with open('wubi.json', 'w') as f:
        f.write(json.dumps(result))

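# `task` is not shown above. Whatever it does (the file names suggest a
# character-to-Wubi-code lookup), the pool only requires that it accept one
# key and return a (key, value) pair; a purely hypothetical placeholder:
def task(key):
    value = convert_to_wubi(key)  # hypothetical conversion helper
    return key, value
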
def format_to_lines(args):
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    '''
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # real_name = f.split('/')[-1].split('.')[0]
        f = str(f)
        # the character at position 97 of the path is used to decide the split
        if (f[97] == "a"):
            valid_files.append(f)
        elif (f[97] == "e"):
            test_files.append(f)
        elif (f[97] == "r"):
            train_files.append(f)
        # else:
        #     train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                # pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                pt_file = args.save_path + corpus_type + str(p_ct) + ".json"
                # print(pt_file)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = args.save_path + corpus_type + str(p_ct) + ".json"
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    logger.info("txt read finished")
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:
            logger.info("not in mapping file: %s", f)
            train_files.append(f)
    logger.info("data split over")
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_json(args):
    corpus_mapping = {}
    type_list = ['valid', 'test', 'train']
    for corpus_type in type_list:
        temp = []
        for line in open(join(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(join(args.tokenized_path, '*.json')):
        real_name = os.path.basename(f).split('.')[0]
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in type_list:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        data_set = []
        p_ct = 0
        for d in pool.imap_unordered(_format_json, a_lst):
            data_set.append(d)
            if len(data_set) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(args.json_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(data_set))
                p_ct += 1
                data_set = []
                print("saving ...")
        pool.close()
        pool.join()
        if len(data_set) > 0:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(data_set))
            p_ct += 1

def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        print("Real name is:", real_name)
        # print("this needs to equal:")
        # print("corpus mapping of valid/test/train:", corpus_mapping['valid'])
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:
            # Bryan edit this out later (original)
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_to_lines(args): corpus_mapping = {} for corpus_type in ["valid", "test", "train"]: temp = [] for line in open( pjoin(args.map_path, "mapping_" + corpus_type + ".txt")): temp.append(hashhex(line.strip())) corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp} train_files, valid_files, test_files = [], [], [] for f in glob.glob(pjoin(args.raw_path, "*.json")): real_name = f.split("/")[-1].split(".")[0] if real_name in corpus_mapping["valid"]: valid_files.append(f) elif real_name in corpus_mapping["test"]: test_files.append(f) elif real_name in corpus_mapping["train"]: train_files.append(f) # else: # train_files.append(f) corpora = {"train": train_files, "valid": valid_files, "test": test_files} for corpus_type in ["train", "valid", "test"]: a_lst = [(f, args) for f in corpora[corpus_type]] pool = Pool(args.n_cpus) dataset = [] p_ct = 0 for d in pool.imap_unordered(_format_to_lines, a_lst): dataset.append(d) if len(dataset) > args.shard_size: pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct) with open(pt_file, "w") as save: # save.write('\n'.join(dataset)) save.write(json.dumps(dataset)) p_ct += 1 dataset = [] pool.close() pool.join() if len(dataset) > 0: pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct) with open(pt_file, "w") as save: # save.write('\n'.join(dataset)) save.write(json.dumps(dataset)) p_ct += 1 dataset = []
def format_to_lines(args, split_ratio=[0.8, 0.1, 0.1]):
    output_dir = os.path.dirname(args.save_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    file_list = os.listdir(args.raw_path)
    file_list.sort(
        key=lambda f: (datetime.strptime(f.rsplit("_", 1)[0], '%Y_%m_%d'),
                       int(f.rsplit("_", 1)[1].split(".")[0])))
    file_list = ["%s/%s" % (args.raw_path, f) for f in file_list]
    # print(file_list)
    train_count, valid_count, test_count = [
        round(len(file_list) * x) for x in split_ratio
    ]
    print(train_count, valid_count, test_count)
    train_files = file_list[:train_count]
    valid_files = file_list[train_count:train_count + valid_count]
    test_files = file_list[train_count + valid_count:]
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        # randomly assigned the entries in a_lst to different processors in the pool
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

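# The sort key above assumes file names like '2019_07_01_3.json' (hypothetical
# example): everything before the last underscore parses as a %Y_%m_%d date and
# the trailing piece is an integer sequence number. A small illustration of
# that assumption:
from datetime import datetime

def _order_key(fname):
    date_part, seq_part = fname.rsplit("_", 1)
    return datetime.strptime(date_part, "%Y_%m_%d"), int(seq_part.split(".")[0])

print(_order_key("2019_07_01_3.json"))  # (datetime.datetime(2019, 7, 1, 0, 0), 3)
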
def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        # randomly assigned the entries in a_lst to different processors in the pool
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def get_text(self):
    if self.filter:
        print("Loading words to keep file...")
        with open(self.words_to_keep_path) as input_file:
            for line in input_file:
                word = line.strip()
                self.words_to_keep.add(word)
    print("Parsing text body of all files...", flush=True)
    pool = Pool(self.workers)
    chunksize = len(self.files) // self.workers // 10
    with tqdm(total=len(self.files), leave=self.debug) as pbar:
        for _ in pool.imap_unordered(self.parse_file, self.files, chunksize=chunksize or 1):
            pbar.update()
    pool.close()
    pool.join()

def format_to_lines(args):
    train_files, valid_files, test_files, new_files = [], [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        fn = f.split('/')[-1].split('.')[0]
        if fn.startswith('train'):
            train_files.append(f)
        elif fn.startswith('valid'):
            valid_files.append(f)
        elif fn.startswith('test'):
            test_files.append(f)
        elif fn.startswith('new'):
            new_files.append(f)
    corpora = {
        'train': train_files,
        'valid': valid_files,
        'test': test_files,
        'new': new_files
    }
    for corpus_type in ['train', 'valid', 'test', 'new']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def format_to_lines(args):
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    cur = 0
    valid_test_ratio = 0.01
    all_size = len(glob.glob(pjoin(args.raw_path, '*.json')))
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        if cur < valid_test_ratio * all_size:
            valid_files.append(f)
        elif cur < valid_test_ratio * 2 * all_size:
            test_files.append(f)
        else:
            train_files.append(f)
        cur += 1
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def imap_progress(f, args, threads=1, overwrite=True, overwrite_last=True,
                  return_output=True,
                  msg=lambda curr, last: f"{curr}/{last} done.",
                  lvl=0, quiet=False):
    printi = make_local_print(quiet=False, printf=make_print_preindent(lvl + 1))
    pool = Pool(threads)
    total = len(args)
    output = []
    def print_update(curr):
        if overwrite_last or (overwrite and curr < total):
            printi(msg(curr, total), overwrite=True)
        else:
            printi(msg(curr, total), overwrite=False)
    print_update(0)
    for i, result in enumerate(pool.imap_unordered(f, args), 1):
        output.append(result)
        print_update(i)
    pool.close()
    return output if return_output else None

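# A possible usage of `imap_progress` with a toy worker. `make_local_print`
# and `make_print_preindent` come from the surrounding project and are assumed
# to already be imported; `_square` and the call below are illustrative only.
def _square(x):
    return x * x

# squares = imap_progress(_square, list(range(100)), threads=4,
#                         msg=lambda curr, last: f"{curr}/{last} squared.")
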
def format_xsum_to_lines(args):
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train', 'test', 'valid']
    if args.data_split_json_is_full_path:
        corpus_mapping = json.load(open(args.data_split_json))
    else:
        corpus_mapping = json.load(open(pjoin(args.raw_path, args.data_split_json)))
    for corpus_type in datasets:
        mapped_fnames = corpus_mapping[corpus_type]
        root_src = pjoin(args.raw_path, 'restbody')
        root_tgt = pjoin(args.raw_path, 'firstsentence')
        # realnames = [fname.split('.')[0] for fname in os.listdir(root_src)]
        realnames = mapped_fnames
        a_lst = [(root_src, root_tgt, n) for n in realnames]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_xsum_to_lines, a_lst):
            if (d is None):
                continue
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

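# `_format_xsum_to_lines` is not shown here. In the PreSumm XSum pipeline the
# worker reads the 'restbody' file as the source and the 'firstsentence' file
# as the target for each document id, and returns None when either is missing.
# A rough sketch under that assumption (the '.restbody' and '.fs' extensions
# are taken from that pipeline and may differ in other setups):
def _format_xsum_to_lines(params):
    root_src, root_tgt, name = params
    f_src = pjoin(root_src, name + '.restbody')
    f_tgt = pjoin(root_tgt, name + '.fs')
    if os.path.exists(f_src) and os.path.exists(f_tgt):
        source = [sent.split() for sent in open(f_src)]
        tgt = [sent.split() for sent in open(f_tgt)]
        return {'src': source, 'tgt': tgt}
    return None
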
def format_to_lines(args):
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(hashhex(line.strip()))
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = os.path.splitext(os.path.splitext(os.path.basename(f))[0])[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                json.dump(dataset, open(pt_file, 'w'), indent=2)
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            json.dump(dataset, open(pt_file, 'w'), indent=2)
            p_ct += 1

def format_to_lines(args): corpus_mapping = {"valid": ["valid"], "train": ["train"], "test": ["test"]} corpus_type = ['valid', 'test', 'train'] train_files, valid_files, test_files = [], [], [] for f in glob.glob(pjoin(args.raw_path, '*.json')): real_name = f.split('/')[-1].split('.')[0] if (real_name in corpus_mapping["valid"]): valid_files.append(f) elif (real_name in corpus_mapping["test"]): test_files.append(f) elif (real_name in corpus_mapping["train"]): train_files.append(f) # else: # train_files.append(f) corpora = {'train': train_files, 'valid': valid_files, 'test': test_files} print(corpora) for corpus_type in ['train', 'valid', 'test']: a_lst = [(f, args) for f in corpora[corpus_type]] pool = Pool(args.n_cpus) dataset = [] p_ct = 0 for d in pool.imap_unordered(_format_to_lines, a_lst): dataset += d pool.close() pool.join() div, mod = divmod(len(dataset), args.shard_size) n_iter = div if mod == 0 else div + 1 for p_ct in tqdm.tqdm(range(n_iter), desc="Shard Iter: "): pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct) with open(pt_file, 'w') as save: shard_size_dataset = dataset[:args.shard_size] save.write(json.dumps(shard_size_dataset, ensure_ascii=False)) dataset = dataset[args.shard_size:]
def test_multiprocess_function():
    """Test parallel processing with multiprocess"""
    logger = getLogger("ostap.test_multiprocess_function")
    logger.info('Test job submission with %s' % multiprocess)
    if not dill:
        logger.error("dill is not available")
        return
    if not multiprocess:
        logger.error("multiprocess is not available")
        return
    from ostap.core.known_issues import DILL_ROOT_issue
    if DILL_ROOT_issue:
        logger.warning("test is disabled for Python %s (dill/ROOT issue)")
        return
    ncpus = multiprocess.cpu_count()
    from multiprocess import Pool
    pool = Pool(ncpus)
    jobs = pool.imap_unordered(make_histos, zip(count(), inputs))
    result = None
    for h in progress_bar(jobs, max_value=len(inputs)):
        if not result:
            result = h
        else:
            result.Add(h)
    pool.close()
    pool.join()
    logger.info("Histogram is %s" % result.dump(80, 20))
    logger.info("Entries %s/%s" % (result.GetEntries(), sum(inputs)))
    result.Draw()
    time.sleep(2)
    return result

def _df_to_jsons(df, prefix, to_dir, n_cpus, clients):
    NUM_DOCS_IN_ONE_FILE = 1000
    first_row_idx_list = list(range(0, len(df), NUM_DOCS_IN_ONE_FILE))
    digits_num = len(str(len(df)))
    client_list = clients * ((len(first_row_idx_list) // len(clients)) + 1)
    df_list = []
    file_name_list = []
    for i, first_row_idx in enumerate(first_row_idx_list):
        if i == len(first_row_idx_list) - 1:  # last element
            last_row_idx = len(df)
            df_list.append(df[first_row_idx:])
        else:
            last_row_idx = first_row_idx + NUM_DOCS_IN_ONE_FILE
            df_list.append(df[first_row_idx:last_row_idx])
        # zero-pad the row indices in the file name so the files sort correctly
        start_row_idx_str = (digits_num - len(str(first_row_idx))) * '0' + str(first_row_idx)
        last_row_idx_str = (digits_num - len(str(last_row_idx - 1))) * '0' + str(last_row_idx - 1)
        file_name = f'{to_dir}/{prefix}_{start_row_idx_str}_{last_row_idx_str}.json'
        file_name_list.append(file_name)
    print(f"----------{prefix} start({len(first_row_idx_list)} files)------------")
    # with Pool(processes=n_cpus) as pool:
    pool = Pool(n_cpus)
    port_idx = list(range(n_cpus))
    # zip stops at the shortest of the three iterables
    for result in pool.imap_unordered(_df_to_json, zip(df_list, file_name_list, client_list)):
        print(result)
    pool.close()
    pool.join()
    # for params in tqdm(zip(dfs_split, idx_list, file_name_list)):
    #     _df_to_json(params)
    print(f"----------{prefix} end({len(first_row_idx_list)} files)------------")

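# `_df_to_json` is not shown. Given how it is called above, it receives one
# (dataframe_chunk, file_name, client) triple and is expected to return a short
# status string; a hypothetical sketch that simply serializes the chunk:
def _df_to_json(params):
    df_chunk, file_name, client = params
    # `client` is presumably a per-worker service handle (e.g. a tokenizer
    # endpoint); it is unused in this bare-bones version.
    df_chunk.to_json(file_name, orient='records', force_ascii=False)
    return f'wrote {file_name} ({len(df_chunk)} rows)'
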
def format_xsum_to_lines(args):
    if args.dataset != "":
        datasets = [args.dataset]
    else:
        datasets = ["train", "test", "valid"]
    corpus_mapping = json.load(
        open(pjoin(args.raw_path, "XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.json"))
    )
    for corpus_type in datasets:
        mapped_fnames = corpus_mapping[corpus_type]
        root_src = pjoin(args.raw_path, "restbody")
        root_tgt = pjoin(args.raw_path, "firstsentence")
        # realnames = [fname.split('.')[0] for fname in os.listdir(root_src)]
        realnames = mapped_fnames
        a_lst = [(root_src, root_tgt, n) for n in realnames]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_xsum_to_lines, a_lst):
            if d is None:
                continue
            dataset.append(d)
            if len(dataset) > args.shard_size:
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, "w") as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        if len(dataset) > 0:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, "w") as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []

def test():
    print('cpuCount() = %d\n' % cpuCount())
    #
    # Create pool
    #
    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)
    #
    # Tests
    #
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]
    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)
    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()
    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()
    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()
    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()
    #
    # Simple benchmarks
    #
    N = 100000
    print('def pow3(x): return x**3')
    t = time.time()
    A = list(map(pow3, xrange(N)))
    print('\tmap(pow3, xrange(%d)):\n\t\t%s seconds' %
          (N, time.time() - t))
    t = time.time()
    B = pool.map(pow3, xrange(N))
    print('\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' %
          (N, time.time() - t))
    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N//8))
    print('\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s'
          ' seconds' % (N, N//8, time.time() - t))
    assert A == B == C, (len(A), len(B), len(C))
    print()
    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')
    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' %
          (time.time() - t))
    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' %
          (time.time() - t))
    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' %
          (len(L)//8, time.time() - t))
    assert A == B == C, (len(A), len(B), len(C))
    print()
    del A, B, C, L
    #
    # Test error handling
    #
    print('Testing error handling:')
    try:
        print(pool.apply(f, (5,)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')
    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')
    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')
    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')
    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()
    #
    # Testing timeouts
    #
    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
    #
    # Testing callback
    #
    print('Testing callback:')
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()
    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()
    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))
    #
    # Check there are no outstanding tasks
    #
    assert not pool._cache, 'cache = %r' % pool._cache
    #
    # Check close() methods
    #
    print('Testing close():')
    for worker in pool._pool:
        assert worker.is_alive()
    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()
    assert result.get() is None
    for worker in pool._pool:
        assert not worker.is_alive()
    print('\tclose() succeeded\n')
    #
    # Check terminate() method
    #
    print('Testing terminate():')
    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()
    for worker in pool._pool:
        assert not worker.is_alive()
    print('\tterminate() succeeded\n')
    #
    # Check garbage collection
    #
    print('Testing garbage collection:')
    pool = Pool(2)
    processes = pool._pool
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    del results, pool
    time.sleep(0.2)
    for worker in processes:
        assert not worker.is_alive()
    print('\tgarbage collection succeeded\n')