def dump_docs(dmp):
    """Documents.

    Fetches the document list through ``dmp._vk.docs.get()`` and downloads
    every item into ``dump/docs`` using a process pool, printing progress.

    dmp: Dumper object
    """
    folder = os.path.join('dump', 'docs')
    os.makedirs(folder, exist_ok=True)

    print('[получение списка документов]')
    docs = dmp._vk.docs.get()

    print('Сохраненние документов:')
    if docs['count'] == 0:
        # Nothing to fetch: only report how many files the folder already holds.
        already_there = next(os.walk(folder))[2]
        print(' 0/0 (total: {})'.format(len(already_there)))
        return

    objs = [
        {
            'url': doc['url'],
            'name': doc['title'] + '_' + str(doc['id']),
            'ext': doc['ext'],
        }
        for doc in docs['items']
    ]

    print(' .../{}'.format(docs['count']), end='\r')
    with Pool(dmp._settings['POOL_PROCESSES']) as pool:
        job_args = zip(itertools.repeat(dmp.__class__), objs,
                       itertools.repeat(folder))
        res = pool.starmap(copy_func(dmp._download), job_args)

    # Falsy results are skipped by filter(None, ...) before summing.
    saved = sum(filter(None, res))
    on_disk = len(next(os.walk(folder))[2])
    print('\x1b[2K {}/{} (total: {})'.format(saved, len(objs), on_disk))
def run_mcts(self, env, runs_per_round):
    """
    Runs all batched MCTS instances concurrently on the STOVE model

    Each round selects a node in every tree through a worker pool, expands
    all actions for all trees in one batched ``env.rollout`` call, evaluates
    the expanded nodes with random rollouts and backpropagates per tree.

    :param env: (STOVE) a STOVE instance representing the env
    :param runs_per_round: (int) the number of MCTS expansions to perform
    :return: a list with, per tree, the index of the action whose
        ``Nsa['r' + str(a)]`` count is largest
    """
    # The context manager releases the workers even when a rollout raises.
    with Pool(self.num_mcts) as pool:
        for _ in range(runs_per_round):
            all_states = []
            all_zs = []
            for state, z in pool.imap(select, self.trees):
                all_states.append(state)
                all_zs.append(z)

            # expand all mcts by applying all next actions batched on all mcts zs
            expansion_actions = multi_one_hot(range(self.actions), self.actions)
            expansion_actions = expansion_actions.view(self.actions, 1,
                                                       self.actions)
            expansion_actions = expansion_actions.repeat(
                self.num_mcts, 1, 1).to('cuda')
            new_zs, r = env.rollout(
                tile(torch.cat(all_zs, 0), 0, self.actions).to('cuda'),
                num=1,
                actions=expansion_actions,
                appearance=tile(self.obj_app, 0, self.actions).to('cuda'))

            # rollout all new expanded nodes in parallel
            random_rollout_actions = np.random.randint(
                self.actions,
                size=(self.actions * self.num_mcts * self.max_rollout * 2, ))
            random_rollout_actions = multi_one_hot(random_rollout_actions,
                                                   self.actions)
            random_rollout_actions = random_rollout_actions.view(
                self.num_mcts * self.actions, self.max_rollout * 2,
                self.actions)
            _, r_rollout = env.rollout(
                new_zs[:, -1].to('cuda'),
                num=self.max_rollout * 2,
                actions=random_rollout_actions,
                appearance=tile(self.obj_app, 0, self.actions).to('cuda'))

            # Tree j owns the contiguous slice [j * actions, (j + 1) * actions).
            for j, mcts in enumerate(self.trees):
                low = j * self.actions
                high = (j + 1) * self.actions
                mcts.backpropagate(new_zs[low:high], r[low:high],
                                   r_rollout[low:high], all_states[j])

    actions = []
    for i in range(self.num_mcts):
        counts = [
            self.trees[i].Nsa['r' + str(a)] for a in range(self.actions)
        ]
        actions.append(np.argmax(counts))
    return actions
def format_to_nnsum(args, split_ratio=(0.8, 0.1, 0.1)):
    '''
    Convert data to what nnsum (https://github.com/kedz/nnsum) can use for
    training SummaRunner and other baseline models.

    The files in ``args.raw_path`` are sorted by the ``%Y_%m_%d`` date prefix
    and the integer suffix of their names, then sliced in that order into
    train / valid / test according to ``split_ratio``.

    label_file: {id}.json
        {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
         "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,...]}
    abstract_file: {id}.spl
        # nnsum paper uses tokenized words joined by space as each sentence,
        # but uncased (both upper and lower case included)
    input_file: {id}.json
        {"input": [sent_1, sent_2, ..., sent_n], "id":story_id}
        sent_i: {"text":original text, "tokens":word list, "pos":postag,
                 "ne":NER, "word_count":word count of sent_i, "sentence_id":i}
        # sentence_id is from 1
        # The fields really used in the model are: "tokens", "text"

    args: namespace providing ``raw_path``, ``save_path`` and ``n_cpus``.
    split_ratio: three fractions (train, valid, test); the test split takes
        whatever remains after the rounded train and valid counts.
    '''
    output_dir = os.path.dirname(args.save_path)
    if output_dir:  # dirname is '' for a bare name; makedirs('') would raise
        os.makedirs(output_dir, exist_ok=True)

    def _sort_key(name):
        date_part, index_part = name.rsplit("_", 1)
        return (datetime.strptime(date_part, '%Y_%m_%d'),
                int(index_part.split(".")[0]))

    file_list = sorted(os.listdir(args.raw_path), key=_sort_key)
    file_list = ["%s/%s" % (args.raw_path, f) for f in file_list]

    train_count, valid_count, test_count = [
        round(len(file_list) * x) for x in split_ratio
    ]
    print(train_count, valid_count, test_count)

    corpora = {
        'train': file_list[:train_count],
        'valid': file_list[train_count:train_count + valid_count],
        'test': file_list[train_count + valid_count:],
    }

    data_dir = pathlib.Path(args.save_path)
    for corpus_type in ['train', 'valid', 'test']:
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        for directory in (input_dir, label_dir, abstracts_dir):
            directory.mkdir(exist_ok=True, parents=True)  # like 'mkdir -p'

        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        num_stories = len(a_lst)

        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)
            for idx, _ in enumerate(result_iter, 1):
                print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                       num_stories),
                      end="\r" if idx < num_stories else "\n",
                      flush=True)
def format_to_nnsum(args):
    '''
    Convert data to what nnsum (https://github.com/kedz/nnsum) can use for
    training SummaRunner and other baseline models.

    Each ``*.json`` file in ``args.raw_path`` is assigned to the split whose
    ``mapping_<split>.txt`` file (under ``args.map_path``) contains a line
    that ``hashhex`` maps to the file's base name (text before the first dot).

    label_file: {id}.json
        {"id":"7f168bcf16ff08b32221d0c3993701dd502de584",
         "labels":[1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,...]}
    abstract_file: {id}.spl
        # nnsum paper uses tokenized words joined by space as each sentence,
        # but uncased (both upper and lower case included)
    input_file: {id}.json
        {"input": [sent_1, sent_2, ..., sent_n], "id":story_id}
        sent_i: {"text":original text, "tokens":word list, "pos":postag,
                 "ne":NER, "word_count":word count of sent_i, "sentence_id":i}
        # sentence_id is from 1
        # The fields really used in the model are: "tokens", "text"
    '''
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        mapping_path = pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')
        with open(mapping_path) as mapping_file:  # closed deterministically
            corpus_mapping[corpus_type] = {
                hashhex(line.strip()).strip(): 1 for line in mapping_file
            }

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # Path.name handles the platform's separator, unlike split('/').
        real_name = pathlib.Path(f).name.split('.')[0]
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    data_dir = pathlib.Path(args.save_path)
    for corpus_type in ['train', 'valid', 'test']:
        input_dir = data_dir / "nnsum_inputs" / corpus_type
        label_dir = data_dir / "nnsum_labels" / corpus_type
        abstracts_dir = data_dir / "human-abstracts" / corpus_type
        for directory in (input_dir, label_dir, abstracts_dir):
            directory.mkdir(exist_ok=True, parents=True)  # like 'mkdir -p'

        a_lst = [(f, args, input_dir, abstracts_dir, label_dir)
                 for f in corpora[corpus_type]]
        num_stories = len(a_lst)

        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            result_iter = pool.imap_unordered(_format_to_nnsum, a_lst)
            for idx, _ in enumerate(result_iter, 1):
                print("{}: Writing story {}/{}".format(corpus_type, idx,
                                                       num_stories),
                      end="\r" if idx < num_stories else "\n",
                      flush=True)
def run(self):
    """
    This function reads the feature extraction filelist and creates a pool of
    processes to extract features from distinct files in parallel. It writes
    one feature file per extracted track (via ``track.save``) into the scratch
    directory. Files are processed in chunks of ``output_buffer_size`` so that
    only one chunk of results is held in memory at a time.

    .. note::
        These keys are expected to be set in the experiment file:
            * ['general']['feature_extraction_filelist']
            * ['general']['scratch_directory']
            * ['feature_extraction']['output_buffer_size']
            * ['feature_extraction']['worker_extractors']
    """
    print("Running feature extraction behavior: %s" % self.name)

    # todo: use metadata file to add labels to track metadata (if available)
    # should guarantee the label is present in the metadata to make life
    # easier, instead of using the file name (probably not needed)
    with open(self.params['general']['feature_extraction_filelist']) as f:
        # Only the first tab-separated field of every line is used.
        files = [line.split("\t")[0] for line in f.read().splitlines()]

    # todo: use a multiprocessing.Manager to share the buffer (instead of
    # working in chunks, as below)
    num_files = len(files)
    output_buffer_size = self.params['feature_extraction']['output_buffer_size']

    pool = Pool(processes=self.params['feature_extraction']['worker_extractors'])
    try:
        for i in range(0, num_files, output_buffer_size):
            chunk_end = min(i + output_buffer_size, num_files)
            # Parenthesised single-argument print works on Python 2 and 3.
            print("processing files %d through %d of %d" %
                  (i + 1, chunk_end, num_files))
            result = pool.map(self.extract, files[i:chunk_end])

            T0 = time.time()
            for track in result:
                filename = acf_utils.extract_filename(
                    track.metadata.filename, "wav") + ".features"
                filename = self.params['general']['scratch_directory'] + "/" + filename
                print("writing features to file %s..." % (filename))
                with open(filename, "w") as feature_file:
                    track.save(feature_file)
                del track
            T1 = time.time()
            print("writing feature files to disk took %f seconds" % (T1 - T0))

            # Drop the chunk before extracting the next one to bound memory.
            del result
            gc.collect()
    finally:
        # Release the workers even if extraction or writing failed.
        pool.close()
        pool.join()

    print('Feature extraction done!')
def test(f: Callable, inp: list, outp: list, name: str) -> list: try: with Pool() as p: out = p.map(f, inp) except: out = None status = 'OK' if out == outp else 'FAILED' print(f'{name}: {status}')
def pcall_mp(fun, args, cores=cores):
    """Calls a function for every input in args, in parallel.

    fun: picklable callable taking one argument
    args: iterable of inputs
    cores: number of worker processes (defaults to the module-level ``cores``)

    Returns the list of results in input order. The pool is terminated when
    the call finishes, including when ``fun`` raises.
    """
    with Pool(cores) as mainpool:
        return mainpool.map(fun, args)
def ospf_check():
    """Run ``_inf_ospf_check`` for every device listed in ``devicesFile``.

    The device identifier is the text before the first comma of each line.
    All workers share one manager lock, passed as the first argument of
    ``_inf_ospf_check``.
    """
    clear_log()
    with open(devicesFile) as device_file:  # closed as soon as it is read
        devices = [row.split(',')[0] for row in device_file]

    # map() blocks until every device is processed; the context manager then
    # releases the workers, also when a worker raised.
    with Pool(processor) as pool:
        lock = Manager().Lock()
        pool.map(partial(_inf_ospf_check, lock), devices)
def parse_rows(rows):
    """Parse all rows in parallel and split the results into three lists.

    Every row is passed to ``parse_row``, whose result is unpacked as a
    (feature, label, surface) triple.

    rows: sized sequence of rows
    Returns ``(features, labels, surfaces)``, each a list in row order.
    Empty input yields three empty lists without starting a pool.
    """
    if len(rows) == 0:
        # zip(*[]) yields nothing, so the three-way unpack below would fail.
        return [], [], []

    with Pool(processes=32, maxtasksperchild=1000) as pool:
        iterator = pool.imap(parse_row, rows, chunksize=100)
        iterator_tracked = tqdm(iterator, desc='parsing rows', total=len(rows))
        parsed_rows = list(iterator_tracked)

    features, labels, surfaces = list(map(list, zip(*parsed_rows)))
    return features, labels, surfaces
def get_issues(self, start_date=None, end_date=None):
    """Collect the entries of every path into one DataFrame.

    Calls ``self.__create_entries(path, start_date, end_date)`` for each path
    in ``self.__paths`` through a pool and concatenates the per-path results.

    Returns a ``pandas.DataFrame`` with columns ``ISSUE_FIELDS``; it is empty
    when there are no paths.
    """
    # NOTE(review): a nested function cannot be pickled by the standard
    # multiprocessing.Pool; presumably Pool here is a thread pool or a
    # dill-based pool - confirm against the file's imports.
    def wrapper(path):
        return self.__create_entries(path, start_date, end_date)

    with Pool() as pool:
        per_path_entries = pool.map(wrapper, self.__paths)

    # Flatten in one pass; unlike reduce() without an initial value this
    # also works when there are no paths.
    entries = [entry for chunk in per_path_entries for entry in chunk]
    return pandas.DataFrame(entries, columns=ISSUE_FIELDS)
def get_log(self, start_date=None, end_date=None):
    """Collect the log entries of every path into one DataFrame.

    Calls ``self.__create_log_entries(path, start_date, end_date)`` for each
    path in ``self.__paths`` through a pool and concatenates the results.

    Returns a ``pandas.DataFrame`` with columns
    ``GIT_COMMIT_FIELDS + ["repository"]``; it is empty when there are no
    paths.
    """
    # NOTE(review): a nested function cannot be pickled by the standard
    # multiprocessing.Pool; presumably Pool here is a thread pool or a
    # dill-based pool - confirm against the file's imports.
    def wrapper(path):
        return self.__create_log_entries(path, start_date, end_date)

    with Pool() as pool:
        per_path_entries = pool.map(wrapper, self.__paths)

    # Flatten in one pass; unlike reduce() without an initial value this
    # also works when there are no paths.
    entries = [entry for chunk in per_path_entries for entry in chunk]
    return pandas.DataFrame(entries, columns=GIT_COMMIT_FIELDS + ["repository"])
def process(self):
    """Preprocess every image in ``self.image_id_list`` and save dataset stats.

    One info dict per image (id, absolute and relative file path, COCO
    annotations) is handed to ``preprocess_one`` in a process pool. Each call
    returns four values that are aggregated here: the mean is computed as
    s1 / s0 and the standard deviation as sqrt(s2 / s0 - mean ** 2), and the
    class frequencies are averaged with s0 as weight. The stats are stored in
    ``self.stats``, saved to ``self.stats_filepath``, and the processed-flag
    file is touched.
    """
    images_relative_dirpath = os.path.join("raw", self.fold, "images")
    coco = self.get_coco()

    image_info_list = []
    for image_id in self.image_id_list:
        filename = coco.loadImgs(image_id)[0]["file_name"]
        annotation_list = coco.loadAnns(coco.getAnnIds(imgIds=image_id))
        image_info_list.append({
            "image_id": image_id,
            "image_filepath": os.path.join(self.root, images_relative_dirpath,
                                           filename),
            "image_relative_filepath": os.path.join(images_relative_dirpath,
                                                    filename),
            "annotation_list": annotation_list,
        })

    worker = partial(preprocess_one,
                     pre_filter=self.pre_filter,
                     pre_transform=self.pre_transform,
                     processed_dir=self.processed_dir)
    with Pool(self.pool_size) as p:
        progress = tqdm(p.imap(worker, image_info_list),
                        total=len(image_info_list))
        sample_stats_list = list(progress)

    # Aggregate sample_stats_list: transpose into four per-image sequences.
    s0_per_image, s1_per_image, s2_per_image, freq_per_image = zip(
        *sample_stats_list)
    s0 = np.stack(s0_per_image, axis=0)
    s1 = np.stack(s1_per_image, axis=0)
    s2 = np.stack(s2_per_image, axis=0)
    freq = np.stack(freq_per_image, axis=0)

    s0_total = np.sum(s0, axis=0)
    s1_total = np.sum(s1, axis=0)
    s2_total = np.sum(s2, axis=0)

    image_mean = s1_total / s0_total
    variance = s2_total / s0_total - np.power(image_mean, 2)
    image_std = np.sqrt(variance)
    class_freq = np.sum(freq * s0[:, None], axis=0) / s0_total

    # Save aggregated stats
    self.stats = {
        "image_mean": image_mean,
        "image_std": image_std,
        "class_freq": class_freq,
    }
    torch.save(self.stats, self.stats_filepath)

    # Indicates that processing has been performed:
    pathlib.Path(self.processed_flag_filepath).touch()
def format_to_lines(args):
    """Group tokenized json files by split and write them as json shards.

    For each split, ``mapping_<split>.txt`` under ``args.map_path`` lists the
    documents of that split; both the raw line and its ``hashhex`` digest are
    accepted as keys. A missing mapping file is reported and skipped. Every
    ``*.json`` file in ``args.raw_path`` whose base name (without extension)
    is a key goes to that split, is converted by ``_format_to_lines`` in a
    pool, and is written to ``<save_path>/<split>.<n>.json``. A shard is
    flushed once it holds more than ``args.shard_size`` items.
    """
    # load mapping files
    print('| Loading mapping files ...')
    corpus_mapping = {"train": [], "valid": [], "test": []}
    for corpus_type in ['valid', 'test', 'train']:
        mapping_fp = os.path.join(args.map_path,
                                  "mapping_{}.txt".format(corpus_type))
        if not os.path.exists(mapping_fp):
            print(
                "Mapping file '{}' doesn't exist. Skip the type of mapping files."
                .format(mapping_fp))
            continue
        temp = []
        with open(mapping_fp) as mapping_file:  # closed deterministically
            for line in mapping_file:
                temp.append(hashhex(line.strip()))
                temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}

    # load corresponding tokenized json files
    print('| Loading tokenized json files ...')
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(os.path.join(args.raw_path, '*.json')):
        # splitext returns (root, ext); only the root can match a mapping key.
        real_name = os.path.splitext(os.path.basename(f))[0]
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)

    # convert to target lines json file
    print('| Converting to line-based json files ...')
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        dataset = []
        p_ct = 0
        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            for d in pool.imap_unordered(_format_to_lines, a_lst):
                dataset.append(d)
                if len(dataset) > args.shard_size:
                    pt_file = os.path.join(
                        args.save_path,
                        "{}.{}.json".format(corpus_type, p_ct))
                    with open(pt_file, 'w') as save:
                        save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        # Whatever is left forms the final, smaller shard.
        if dataset:
            pt_file = os.path.join(args.save_path,
                                   "{}.{}.json".format(corpus_type, p_ct))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
    print('| Finish formating to lines-based json files !')
def build_clusters(self):
    """Call ``self.prepare_directory(k, n)`` for every (k, n) pair in parallel.

    The pairs are all combinations of ``k`` in ``self.k_list`` and ``n`` in
    ``range(self.num_iter)``. Afterwards ``self.write_dirlist()`` and
    ``self.print_face()`` are called.
    """
    set_list = [(k, n) for k in self.k_list for n in range(self.num_iter)]
    # starmap blocks until all pairs are done; the context manager then
    # releases the workers, also when a task raised.
    with Pool() as p:
        p.starmap(self.prepare_directory, tqdm(set_list))
    self.write_dirlist()
    self.print_face()
def main():
    """Run ``run_survival_function`` for each survival model in parallel.

    Every task is a ``(survival class, label)`` pair.
    """
    survival_functions = [
        (sv.FractionOldNew, 'FractionNew'),
        (sv.OldNewSurvival, 'OldNewMix'),
        (sv.OldWaning, 'OldWaning'),
    ]
    # map() blocks until all tasks finish; the context manager then shuts
    # the workers down instead of leaving the pool open.
    with Pool() as p:
        p.map(run_survival_function, survival_functions)
def format_to_lines(args):
    """Split the raw files into train/valid/test and write json shards.

    ``SplitRawFiles`` first writes the ``mapping_<split>.txt`` files into
    ``args.map_path``. Every ``*.json`` file in ``args.raw_path`` whose file
    name appears in a mapping file is assigned to that split, converted by
    ``_format_to_lines`` in a pool and written to
    ``<save_path>/<split>.<n>.json``. A shard is flushed once it holds more
    than ``args.shard_size`` items.
    """
    os.makedirs(args.map_path, exist_ok=True)
    os.makedirs(args.save_path, exist_ok=True)

    data_splitter = SplitRawFiles(args.raw_path, args.map_path)
    data_splitter.get_and_split_filenames()
    data_splitter.save_fnames_to_corresponding_files()

    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        mapping_path = pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')
        with open(mapping_path) as mapping_file:  # closed deterministically
            corpus_mapping[corpus_type] = {
                line.strip(): 1 for line in mapping_file
            }

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        # basename handles the platform's separator, unlike split('/').
        real_name = os.path.basename(f)
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        dataset = []
        p_ct = 0
        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            for d in pool.imap_unordered(_format_to_lines, a_lst):
                dataset.append(d)
                if len(dataset) > args.shard_size:
                    pt_file = "{:s}/{:s}.{:d}.json".format(
                        args.save_path, corpus_type, p_ct)
                    with open(pt_file, 'w') as save:
                        save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        # Whatever is left forms the final, smaller shard.
        if dataset:
            pt_file = "{:s}/{:s}.{:d}.json".format(args.save_path,
                                                   corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
def createZips(self):
    """Create the zip files in a process pool and print the elapsed time.

    ``self.createZip`` is called once for every index in
    ``range(self.count_zips)``.
    """
    t1 = time()  # start of the timed section
    # NOTE(review): a __main__ guard inside a method is unusual; when this
    # module is imported rather than run as a script no zip is created.
    # NOTE(review): the extent of this guard is reconstructed - confirm that
    # the timing print below is meant to run outside of it.
    if __name__ == '__main__':
        self.get_list_of_id()  # get set of string id
        p = Pool()
        p.map(self.createZip, range(self.count_zips))
        # No more tasks will be submitted; wait for the workers to exit.
        p.close()
        p.join()
    print('Create .zip files time = ' + str(time() - t1) + 's')
def sleeping(arg): time.sleep(0.1) ncores = 2 pool = Pool(ncores) # sequential run %timeit list(map(sleeping, range(24))) # parallel run %timeit pool.map(sleeping, range(24)) pool.close()
def convertpool(self):
    """Convert the files in ``self.todo`` according to ``self.type``.

    Video types (.h264, .mp4, .avi) are converted in parallel with
    ``self.conv_single``; the originals are removed afterwards when
    ``self.delete`` is set, and only if the conversion succeeded. Image types
    (.jpg, .jpeg, .png) are read with ``cv2.imread`` and written as frames of
    one video named after the files' common prefix. With an empty
    ``self.todo`` only a notice is printed.
    """
    if len(self.todo) > 0:
        if self.type in [".h264", ".mp4", ".avi"]:
            pool = Pool(min(self.pools, len(self.todo)))
            try:
                pool.map(self.conv_single, self.todo)
                pool.close()
                lineprint("Done converting all videofiles!",
                          label="pirecorder")
            except KeyboardInterrupt:
                lineprint("User terminated converting pool..",
                          label="pirecorder")
                self.terminated = True
                pool.terminate()
                return
            except Exception as e:
                excep = "Got exception: %r, terminating pool" % (e, )
                lineprint(excep, label="pirecorder")
                pool.terminate()
                # The conversion failed: stop here so the originals are not
                # deleted below.
                return
            finally:
                pool.join()

            if self.delete:
                for filein in self.todo:
                    os.remove(filein)
                lineprint("Deleted all original videofiles..",
                          label="pirecorder")

        elif self.type in [".jpg", ".jpeg", ".png"]:
            vidname = commonpref(self.todo)
            lineprint("Start converting " + str(len(self.todo)) + " images",
                      label="pirecorder")

            frame_array = [cv2.imread(filename) for filename in self.todo]

            # The output size is taken from the first image.
            h, w, _ = frame_array[0].shape
            if self.outdir != "":
                vidname = self.outdir + "/" + os.path.basename(vidname)
            vidout = videowriter(vidname, w, h, self.imgfps, self.resizeval)
            for frame in frame_array:
                vidout.write(frame)
            vidout.release()
            lineprint("Finished converting " + os.path.basename(vidname),
                      label="pirecorder")
    else:
        lineprint("No video or image files found..", label="pirecorder")
def multimap(function,
             inputs,
             chunked=False,
             processes=32,
             maxtasksperchild=1,
             chunksize=1,
             n_calcs=None):
    '''
    This function is a wrapper to parallelize a function.

    Args:
        function            The function you want to execute
        inputs              An iterable that yields proper arguments to the
                            function
        chunked             A Boolean indicating whether your function expects
                            single arguments or "chunked" iterables, e.g.,
                            lists.
        processes           The number of processes you want to be using. With
                            1, the function is applied sequentially to single
                            inputs and `chunked` is not consulted.
        maxtasksperchild    The maximum number of tasks that a child process
                            may do before terminating (and therefore clearing
                            its memory cache to avoid memory overload).
        chunksize           How many calculations you want to have each single
                            processor do per task. Smaller chunks means more
                            memory shuffling. Bigger chunks means more RAM
                            requirements.
        n_calcs             How many calculations you have. Only necessary for
                            adding a percentage timer to the progress bar.
    Returns:
        outputs     A list of the inputs mapped through the function
    '''
    # Collect garbage before we begin multiprocessing to make sure we don't
    # pass things we don't need to
    gc.collect()

    # If we have one process, there's no use multiprocessing
    if processes == 1:
        output = [function(input_) for input_ in tqdm(inputs, total=n_calcs)]
        return output

    with Pool(processes=processes, maxtasksperchild=maxtasksperchild) as pool:
        # We use imap instead of map so that we get an iterator, which we
        # need for tqdm (the progress bar) to work.
        if not chunked:
            iterator = pool.imap(function, inputs, chunksize=chunksize)
            outputs = list(tqdm(iterator, total=n_calcs))

        # If our function expects chunks, then we have to unpack our inputs
        # appropriately
        else:
            iterator = pool.imap(function, _chunk(inputs, n=chunksize))
            # One progress tick per chunk; n_calcs is optional, so only
            # derive a total when it was given (ceiling division).
            total = None if n_calcs is None else -(-n_calcs // chunksize)
            chunk_outputs = list(tqdm(iterator, total=total))
            # np.concatenate refuses an empty sequence.
            outputs = list(np.concatenate(chunk_outputs)) if chunk_outputs else []

    return outputs
def Multiprocessed_OCRPDF(
        source="",
        targetPath=None,
        processes=4,
        nice=5,
        verbose=False,
        tesseract_config='--oem 1 -l best/eng -c preserve_interword_spaces=1 textonly_pdf=1',
        logger=None):
    """OCR every page of a PDF in a process pool and merge the results.

    ``OCRPDF`` is called once per page (1-based ``page`` argument); the first
    page of each returned PDF is appended to the output document in page
    order.

    source: path of the source PDF (str) or the PDF content as bytes
    targetPath: when truthy, the merged PDF is written to this path
    processes: number of worker processes
    nice, tesseract_config, logger, verbose: forwarded to ``OCRPDF``;
        messages go to ``logger.info`` when a logger is given, else ``print``

    Returns True after writing to ``targetPath``, otherwise the merged PDF
    as bytes.
    """
    log = logger.info if logger else print

    if isinstance(source, str):
        if verbose:
            log("You passed a string in as source. Trying this as source pdf file path.")
        page_count = PyPDF2.PdfFileReader(source).getNumPages()
    else:
        if verbose:
            log("OCRUSREX - Try extracting Images from bytes object")
        page_count = PyPDF2.PdfFileReader(io.BytesIO(source)).getNumPages()

    output = PyPDF2.PdfFileWriter()

    # set up a multiprocess pool with the specified number of processes. Then
    # call the single-threaded OCRPDF method on each page
    # NOTE(review): the standard multiprocessing.Pool cannot pickle a lambda;
    # presumably Pool comes from a dill-based package - confirm.
    with Pool(processes) as pool:
        ocred_pages = pool.map(
            lambda page_index: OCRPDF(source=source,
                                      verbose=verbose,
                                      nice=nice,
                                      page=page_index + 1,
                                      tesseract_config=tesseract_config,
                                      logger=logger),
            range(0, page_count))

    for ocred_page in ocred_pages:
        output.addPage(PyPDF2.PdfFileReader(io.BytesIO(ocred_page)).getPage(0))

    if verbose:
        log("Multithreaded Execution Complete!")

    # If targetPath was provided, assume that it's a string and valid path.
    # Try to write.
    if targetPath:
        with open(targetPath, "wb") as outputStream:
            output.write(outputStream)
        # upon success, return truthy values (in this case, True)
        return True

    # otherwise, return results as bytes obj
    output_file_obj = io.BytesIO()
    output.write(output_file_obj)
    return output_file_obj.getvalue()
def main_change(self, lock):
    """Run ``self.change(i, lock)`` for i = 0 and 1 in two processes and wait for both.

    lock: object passed through unchanged as the second argument of
        ``self.change``
    """
    # Process is used directly: going through Pool().Process started a pool
    # of idle workers that was never closed, and Pool.Process requires an
    # extra context argument since Python 3.8.
    from multiprocessing import Process

    workers = [
        Process(target=self.change, args=(i, lock)) for i in range(2)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def __init__(self, funct, data, threads='all'):
    """Not usable yet: always raises ``NotImplementedError``.

    funct: function the pool is meant to run
    data: data the pool is meant to process
    threads: number of worker processes, or 'all' for ``cpu_count()``

    Raises NotImplementedError (a subclass of ``Exception``) on every call.
    """
    raise NotImplementedError("Not functionnal yet !")

    # Unreachable until the guard above is removed; kept as the intended
    # initialisation.
    self.funct = funct
    if threads == 'all':
        threads = cpu_count()
    self.pool = Pool(processes=threads)
    self.data = data
    self.PG = None
    self.initializer = None
    self.finalizer = None
def format_to_lines(args):
    """Assign the raw json files to train/valid/test and write json shards.

    With ``args.map_on`` set and ``args.map_path`` other than 'empty', each
    split is defined by ``mapping_<split>.txt`` (lines are passed through
    ``hashhex``) and files are matched by base name (text before the first
    dot). Otherwise the splits come from ``manual_corp_assign(args)`` and
    files are matched by their full glob path. Each split is converted by
    ``_format_to_lines`` in a pool and written to
    ``<save_path>.<split>.<n>.json``. A shard is flushed once it holds more
    than ``args.shard_size`` items.
    """
    use_mapping = args.map_on and args.map_path != 'empty'

    corpus_mapping = {}
    if use_mapping:
        for corpus_type in ['valid', 'test', 'train']:
            mapping_path = pjoin(args.map_path,
                                 'mapping_' + corpus_type + '.txt')
            with open(mapping_path) as mapping_file:  # closed deterministically
                corpus_mapping[corpus_type] = {
                    hashhex(line.strip()).strip(): 1 for line in mapping_file
                }
    else:
        # The manual assignment does not depend on the split, so it is
        # computed once rather than once per split.
        tr, va, te = manual_corp_assign(args)
        corpus_mapping['valid'] = va
        corpus_mapping['test'] = te
        corpus_mapping['train'] = tr

    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        if use_mapping:
            # Accept both separators so Windows paths are handled as well.
            real_name = f.replace('\\', '/').split('/')[-1].split('.')[0]
        else:
            real_name = f
        if real_name in corpus_mapping['valid']:
            valid_files.append(f)
        elif real_name in corpus_mapping['test']:
            test_files.append(f)
        elif real_name in corpus_mapping['train']:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        dataset = []
        p_ct = 0
        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            for d in pool.imap_unordered(_format_to_lines, a_lst):
                dataset.append(d)
                if len(dataset) > args.shard_size:
                    pt_file = "{:s}.{:s}.{:d}.json".format(
                        args.save_path, corpus_type, p_ct)
                    with open(pt_file, 'w') as save:
                        save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        # Whatever is left forms the final, smaller shard.
        if dataset:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                   corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
def ensure(self, what, affected_services):
    '''ensures that affected_services are started or stopped with 5 attempts.

    what: string 'off' or 'on'
    affected_services: list { service_name, service_type }

    Waits 1, 2, 3, 5 and 8 seconds before the successive attempts; each
    attempt only retries the services that failed in the previous one.

    returns a tuple (success, names): success is True when no service is
    left, names is a comma separated string of `name.type` for the services
    that still did not do what was requested. When the connection settings
    are missing nothing is attempted and (True, None) is returned.'''
    tries = 4
    wait = [8, 5, 3, 2, 1]  # indexed from the end: 1 s first, 8 s last

    if None in [self.username, self.password, self.server]:
        log.warning(
            'Required environmental variables for connecting to ArcGIS Server do not exist. '
            +
            'No services will be stopped or started. See README.md for more details.'
        )

        return (True, None)

    def act_on_service(service_info):
        #: logs within this context do not show up in the console or log file
        service_name, service_type = service_info
        if what == 'off':
            status, _ = self.turn_off(service_name, service_type)
        else:
            status, _ = self.turn_on(service_name, service_type)

        if not status:
            return (service_name, service_type)

        return None

    def get_service_names(services):
        return ', '.join(
            [name + '.' + service for name, service in services])

    while len(affected_services) > 0 and tries >= 0:
        sleep(wait[tries])
        tries -= 1

        # Environment values are strings; convert before comparing to a count.
        num_processes = environ.get('FORKLIFT_POOL_PROCESSES')
        swimmers = int(num_processes) if num_processes else config.default_num_processes
        if swimmers > len(affected_services):
            swimmers = len(affected_services)

        with Pool(swimmers) as pool:
            log.debug('affected services: %s',
                      get_service_names(affected_services))
            affected_services = [
                service
                for service in pool.map(act_on_service, affected_services)
                if service is not None
            ]

        if len(affected_services) > 0:
            log.debug('retrying %s', get_service_names(affected_services))

    return (len(affected_services) == 0,
            get_service_names(affected_services))
def format_to_lines(args):
    """Randomly split the raw json files 8:1:1 and write json shards.

    Every ``*.json`` file in ``args.raw_path`` draws one number from the
    global ``random`` generator seeded with 1: values up to 0.1 go to valid,
    values up to 0.2 to test, the rest to train. Files are visited in sorted
    order so the split is reproducible. Each split is converted by
    ``_format_to_lines`` in a pool and written to
    ``<save_path>.<split>.<n>.json``. A shard is flushed once it holds more
    than ``args.shard_size`` items.
    """
    import random

    train_files, valid_files, test_files = [], [], []

    # Randomly split the dataset, train:valid:test = 8:1:1
    random.seed(1)
    # glob order depends on the file system; sorting makes the seeded split
    # assign the same files on every run.
    for f in sorted(glob.glob(pjoin(args.raw_path, '*.json'))):
        n = random.random()
        if n <= 0.1:
            valid_files.append(f)
        elif n <= 0.2:
            test_files.append(f)
        else:
            train_files.append(f)

    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        dataset = []
        p_ct = 0
        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            for d in pool.imap_unordered(_format_to_lines, a_lst):
                dataset.append(d)
                if len(dataset) > args.shard_size:
                    pt_file = "{:s}.{:s}.{:d}.json".format(
                        args.save_path, corpus_type, p_ct)
                    with open(pt_file, 'w') as save:
                        save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        # Whatever is left forms the final, smaller shard.
        if dataset:
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path,
                                                   corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
def get_gameplays():
    """Fetch play-finder pages for every team, game number and season.

    For each year from 1998 through 2017 and each game number 1-16, one URL is
    built per team in ``TeamLookup`` and per game location ('H', 'A'). The
    URLs are passed to ``GetAndParsePath`` through a pool of 8 workers, and a
    JSON file ``output/PlayTypeCounts-Year-<year>-Game-<n>.json`` is written.
    """
    PlayTypeDict = {}
    # NOTE(review): PlayTypeStrings is never read in this function;
    # presumably it was meant to classify the fetched play descriptions.
    PlayTypeStrings = {
        'Pass': ['pass incomplete', 'pass complete', 'sacked'],
        'Admin': ['spiked the ball', 'Timeout', 'Penalty', 'aborted'],
        'Kneel': ['knee', 'knelt'],
        'Punt': ['Punts'],
        'Field Goal': ['field goal', 'no good'],
        'Special Teams':
        ['kicks off', 'kicks onside', 'extra point', 'two point'],
        'Run': [
            'left end', 'right end', ' for ', 'up the middle', 'middle for',
            'left tackle', 'left guard', 'right guard', 'right tackle'
        ],
    }
    YearStart = 1998
    YearsToGo = 20
    for Year in range(YearStart, YearStart + YearsToGo):
        # NOTE(review): PlayTypeCounts is initialised per year but never
        # updated or written.
        PlayTypeCounts = {
            'Pass': 0,
            'Run': 0,
            'Punt': 0,
            'Field Goal': 0,
            'Admin': 0,
            'Kneel': 0,
            'Special Teams': 0
        }
        for GameNumber in range(1, 17):
            print('Game', GameNumber, 'in', Year, 'Time: ', datetime.now())
            PlayTypeDict = {}
            PathList = []
            # One request URL per team and per home/away location.
            for Team in TeamLookup:
                for GameLocation in ['H', 'A']:
                    path = 'https://widgets.sports-reference.com/wg.fcgi?css=1&site=pfr&url=%2Fplay-index%2Fplay_finder.cgi%3Frequest%3D1%26match%3Dall%26year_min%3D{YEAR}%26year_max%3D{YEAR}%26game_type%3DR%26game_num_min%3D{GameNumber}%26game_num_max%3D{GameNumber}%26week_num_min%3D0%26week_num_max%3D99%26game_location%3D{GameLocation}%26minutes_max%3D15%26seconds_max%3D0%26minutes_min%3D0%26seconds_min%3D0%26team_id%3D{TEAM}%26field_pos_min_field%3Dteam%26field_pos_max_field%3Dteam%26end_field_pos_min_field%3Dteam%26end_field_pos_max_field%3Dteam%26type%255B%255D%3DPASS%26type%255B%255D%3DRUSH%26type%255B%255D%3DPUNT%26type%255B%255D%3DKOFF%26type%255B%255D%3DONSD%26type%255B%255D%3DFG%26type%255B%255D%3DXP%26type%255B%255D%3D2PC%26no_play%3DN%26turnover_type%255B%255D%3Dinterception%26turnover_type%255B%255D%3Dfumble%26score_type%255B%255D%3Dtouchdown%26score_type%255B%255D%3Dfield_goal%26score_type%255B%255D%3Dsafety%26order_by%3Dyds_to_go&div=div_all_plays&del_col=1,11,12,13,14'.format(
                        YEAR=Year,
                        GameNumber=GameNumber,
                        TEAM=Team,
                        GameLocation=GameLocation)
                    PathList.append(path)
                    #req = get(path)
            p = Pool(8)  # Pool tells how many at a time
            # NOTE(review): records is never used afterwards, so
            # PlayTypeDict is still empty when it is dumped below - confirm
            # whether the aggregation step is missing.
            records = p.map(GetAndParsePath, PathList)
            p.terminate()
            p.join()
            with open(
                    'output/PlayTypeCounts-Year-' + str(Year) + '-Game-' +
                    str(GameNumber) + '.json', 'w') as outfile:
                json.dump(PlayTypeDict, outfile)
def format_to_lines_tfds(args):
    """Format the tokenized files of every split directory as json shards.

    Each sub-directory of ``args.raw_path`` is treated as one split; its
    ``*.json`` files are converted by ``_format_to_lines`` in a pool and
    written to ``<save_path>/<dataset>.<split>.<n>.json``, where ``<dataset>``
    is the last component of ``dirname(args.save_path)`` and the split
    'validation' is written as 'valid'. A shard is flushed as soon as it
    holds ``args.shard_size`` items.
    """
    tokenized_sub_dirs = os.listdir(args.raw_path)
    dataset_name = os.path.dirname(args.save_path).split('/')[-1]

    # Make directory
    os.makedirs(args.save_path, exist_ok=True)

    # Create file list for each split directory
    corpora = {
        sub_dir: glob.glob(pjoin(pjoin(args.raw_path, sub_dir), '*.json'))
        for sub_dir in tokenized_sub_dirs
    }

    for corpus_type in tokenized_sub_dirs:
        type_name = 'valid' if corpus_type == 'validation' else corpus_type
        a_lst = [(f, args) for f in corpora[corpus_type]]
        dataset = []
        p_ct = 0
        # The pool is torn down by the context manager even if a worker fails.
        with Pool(args.n_cpus) as pool:
            for d in pool.imap_unordered(_format_to_lines, a_lst):
                dataset.append(d)
                # NOTE: save files according to shard_size
                if len(dataset) >= args.shard_size:
                    pt_file = "{:s}.{:s}.{:d}.json".format(
                        dataset_name, type_name, p_ct)
                    with open(pjoin(args.save_path, pt_file), 'w') as save:
                        save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        # For the last few data (< shard size)
        if dataset:
            pt_file = "{:s}.{:s}.{:d}.json".format(dataset_name, type_name,
                                                   p_ct)
            with open(pjoin(args.save_path, pt_file), 'w') as save:
                save.write(json.dumps(dataset))
def convert_to_shard_data(post_dir, shard_dir, args):
    """Pack the tokenized files of ``post_dir`` into json shards in ``shard_dir``.

    Every non-hidden file that does not end in ``.abs.txt.json`` is paired
    with ``<name before first dot>.abs.txt.json`` from the same directory and
    passed, with ``args``, to ``format_to_lines`` in a pool. Results keep the
    sorted file order and are written as ``shard.<n>.json`` with exactly
    ``args.shard_size`` items each; the last shard holds the remainder.
    """
    corpora = sorted([
        os.path.join(post_dir, f) for f in os.listdir(post_dir)
        if not f.startswith('.') and not f.endswith('.abs.txt.json')
    ])

    args_list = []
    for f_main in corpora:
        f_abs_name = '{}.abs.txt.json'.format(
            os.path.basename(f_main).split('.')[0])
        f_abs = os.path.join(post_dir, f_abs_name)
        args_list.append((f_main, f_abs, args))

    start = time.time()
    print('... (4) Packing tokenized data into shards...')
    print('Converting files count: {}'.format(len(corpora)))

    shard_count = 0
    dataset = []
    t_len = math.ceil(len(corpora) / args.shard_size)

    # imap yields results in input order, so the shards keep the sorted
    # file order
    with Pool(args.n_cpus) as pool:
        with tqdm(total=t_len) as pbar:
            with tqdm(total=args.shard_size) as spbar:
                for data in pool.imap(format_to_lines, args_list):
                    dataset.append(data)
                    spbar.update()
                    # Flush on the buffer size itself; testing the running
                    # index made the first shard one item too large.
                    if len(dataset) == args.shard_size:
                        fpath = os.path.join(
                            shard_dir, 'shard.{}.json'.format(shard_count))
                        with open(fpath, 'w') as f:
                            f.write(json.dumps(dataset))
                        dataset = []
                        shard_count += 1
                        pbar.update()
                        spbar.reset()

    if len(dataset) > 0:
        fpath = os.path.join(shard_dir, 'shard.{}.json'.format(shard_count))
        with open(fpath, 'w') as f:
            f.write(json.dumps(dataset))
        # Report only after the file has actually been written.
        print('last shard {} saved'.format(shard_count))
        dataset = []
        shard_count += 1

    end = time.time()
    print('... Ending (4), time elapsed {}'.format(end - start))
def calculate(self, data):
    """ run graph calculations

    Validates ``data`` against ``self._schema`` when a schema is set, then
    walks ``self._sorted_dep`` level by level: inputs are loaded onto the
    nodes of the level, the nodes are run (in a ``multiprocess`` pool when
    ``self._parallel`` is set, sequentially otherwise) and their outputs are
    stored back into ``self._data``.

    data: input values, looked up by the nodes' input ``map`` names

    Returns the result of the last node whose outputs were saved, or None
    when there was no node to run.
    Raises ImportError when ``jsonschema`` or ``multiprocess`` is required
    but not installed.
    """
    # make sure data is valid when using schema
    if self._schema:
        try:
            import jsonschema
        except ImportError:
            msg = 'jsonschema package is needed for validating data'
            raise ImportError(msg)
        jsonschema.validate(instance=data, schema=self._schema)

    t1 = dt.datetime.utcnow()
    LOGGER.info('Starting calculation...')
    self._data = Data(data)
    self._data.check_inputs(self.sim_inputs, self.sim_outputs)
    if not self._sorted_dep:
        self._topological_sort()

    res = None  # stays None when there is nothing to run
    for items in self._sorted_dep:
        # loading node with inputs
        for item in items:
            node = self._get_node(item)
            for inp in node.inputs_without_constants:
                node.set_value_to_input(inp.name, self._data[inp.map])

        # running nodes
        if self._parallel:
            try:
                from multiprocess import Pool
            except ImportError:
                msg = 'multiprocess package is needed for parralelism'
                raise ImportError(msg)
            pool = Pool(self._pool_size)
            try:
                results = pool.map(Graph.run_node,
                                   [self._get_node(i) for i in items])
            finally:
                # Release the workers even when a node raised.
                pool.close()
                pool.join()
            # run_node results are unpacked as (key, value) pairs and looked
            # up by node id below.
            results = dict(results)
        else:
            results = {}
            for item in items:
                node = self._get_node(item)
                results[node.id] = node.run_with_loaded_inputs()

        # save results
        for item in items:
            node = self._get_node(item)
            res = results[node.id]
            if len(node.outputs) == 1:
                self._data[node.outputs[0].map] = res
            else:
                for i, out in enumerate(node.outputs):
                    self._data[out.map] = res[i]

    t2 = dt.datetime.utcnow()
    LOGGER.info('Calculation finished in {}'.format(t2 - t1))
    return res