def test_divide_slice(self):
    subset = range(51, 100)
    sub_size = len(subset)
    divide_sub = divide_slices(sub_size, 3, subset.start)
    self.assertEqual(3, len(divide_sub))
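The snippets in this section all call a divide_slices helper whose implementation is not shown. The following is a minimal sketch of what it could look like, assuming it splits n consecutive indices starting at offset into n_slices contiguous ranges; the remainder handling and parameter names are assumptions, not the original code.

def divide_slices(n, n_slices, offset=0):
    # split n consecutive indices (starting at offset) into n_slices contiguous ranges
    slice_size = n // n_slices
    remainder = n % n_slices
    slices = []
    start = offset
    for i in range(n_slices):
        # spread the remainder over the first `remainder` slices
        end = start + slice_size + (1 if i < remainder else 0)
        slices.append(range(start, end))
        start = end
    return slices

Under these assumptions, divide_slices(49, 3, 51) returns three ranges that together cover indices 51 through 99, which is the behaviour the test above checks.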
def parallel_word_count(corpus_file, output_file, lemmatize, max_rows=None, n_processes=8):
    # get data slices
    input_hdf5 = h5py.File(corpus_file, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    nrows = len(dataset)
    input_hdf5.close()

    if max_rows is not None and max_rows < nrows:
        nrows = max_rows

    data_slices = divide_slices(nrows, n_processes, 0)
    args = [(corpus_file, data_slice, lemmatize) for data_slice in data_slices]

    pool = Pool(n_processes)
    result = pool.map(func=word_frequencies, iterable=args)
    pool.close()

    # agglomerate results
    freq = Counter()
    for freq_i in result:
        freq = freq + freq_i
    freq = freq.most_common()

    print("{0} unique words".format(len(freq)))
    for i in range(10):
        (w, f) = freq[i]
        print("{0}:{1}".format(w, f))

    if output_file is not None:
        output_hdf5 = h5py.File(output_file, 'w')
        word_ids = range(len(freq))

        # encode explicitly so that hdf5 can take an array of variable-length strings and store it;
        # hdf5 needs to store variable-length strings with a specific encoding (UTF-8 in this case)
        vocabulary = np.array([freq[i][0].encode("utf8") for i in range(len(freq))])
        dt = h5py.special_dtype(vlen=str)
        output_hdf5.create_dataset("vocabulary", data=vocabulary, dtype=dt, compression="gzip")
        print("vocabulary written")

        freq = np.array([freq[i][1] for i in range(len(freq))])
        output_hdf5.create_dataset("frequencies", data=freq, compression="gzip")
        print("frequencies written")

        output_hdf5.close()
    print("done")
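parallel_word_count maps a word_frequencies worker over argument tuples of (corpus_file, data_slice, lemmatize) and sums the per-slice results as Counters. The worker itself is not part of this excerpt; below is a hypothetical sketch under those assumptions, with a plain whitespace split standing in for the real tokenization and lemmatization logic.

from collections import Counter

import h5py


def word_frequencies(args):
    # hypothetical worker sketch: count tokens in one slice of the "sentences" dataset
    corpus_file, data_slice, lemmatize = args
    freq = Counter()
    with h5py.File(corpus_file, 'r') as input_hdf5:
        dataset = input_hdf5["sentences"]
        for row in data_slice:
            sentence = dataset[row]
            if isinstance(sentence, bytes):
                sentence = sentence.decode("utf-8")
            tokens = sentence.split()  # placeholder for the real tokenizer / lemmatizer
            freq.update(tokens)
    return freq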
def parallel_process_corpus():
    # max_sentences and do_work are expected to be defined at module level
    n_workers = 4
    pool = Pool(n_workers)
    dataset_slices = divide_slices(max_sentences, n_workers)

    t0 = time.time()
    pool.map(func=do_work, iterable=dataset_slices)
    pool.close()
    pool.join()
    t1 = time.time()

    time.sleep(1)
    print("Done: {0:.2f} secs ".format(t1 - t0))
def test_slice(self):
    v = range(0, 16, 1)
    num_splits = 5
    slices = split.divide_slices(len(v), num_splits)
    self.assertEqual(len(slices), num_splits)
    print(slices)
    for r in slices:
        for elem in r:
            print(elem)
        print("-------")
def parallel_ri(corpus_file, max_rows=None, window_size=3, n_processes=8):
    # get data slices
    input_hdf5 = h5py.File(corpus_file, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    nrows = len(dataset)
    input_hdf5.close()

    if max_rows is not None and max_rows < nrows:
        nrows = max_rows

    data_slices = divide_slices(nrows, n_processes, 0)
    args = [(corpus_file, data_slice, window_size) for data_slice in data_slices]

    # share a global lock to avoid messing up the sign index on lookups
    # l = Lock()
    # pool = Pool(initializer=init_lock(l), initargs=(l,), processes=n_processes)
    pool = Pool(processes=n_processes)
    result = pool.map(func=text_to_ri, iterable=args)
    pool.close()
# ======================================================================================
# Master Node
# ======================================================================================
if comm.rank == 0:
    # open hdf5 file and get the dataset
    corpus_hdf5 = h5py.File(corpus_file, 'r')
    corpus_dataset = corpus_hdf5["sentences"]

    if num_rows == -1:
        num_rows = len(corpus_dataset)
    print("Master Node: preparing data, processing [ %d of %d ]" % (num_rows, len(corpus_dataset)))

    subset_slices = divide_slices(n=num_rows, n_slices=num_slaves)
    print("Sending Tasks to %d nodes" % (num_slaves))

    # send slices
    for node in range(1, size):
        slice_i = node - 1
        comm.send(subset_slices[slice_i], dest=node)
    print("Data Delivered")

    num_finished = 0
    while num_finished < num_slaves:
        print("Waiting for results...")
        status = MPI.Status()
        data = comm.recv(source=MPI.ANY_SOURCE, status=status)
        tag = status.Get_tag()
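The master-node block above sends one slice to each worker and then waits for tagged replies. The worker-node branch is not included in this excerpt; the sketch below shows what it might look like as the matching else branch, reusing comm and corpus_file from the surrounding script. The process_slice function and the TAG_RESULT value are hypothetical placeholders, not names from the original code.

else:
    # Worker node (rank > 0): hypothetical counterpart to the master block above
    TAG_RESULT = 1  # assumed tag value; the original protocol's tags are not shown

    # receive the slice assigned by the master
    data_slice = comm.recv(source=0)
    print("Node %d: processing %d rows" % (comm.rank, len(data_slice)))

    # process_slice stands in for the actual per-slice computation
    result = process_slice(corpus_file, data_slice)

    # report the result back to the master, tagged so it can track completion
    comm.send(result, dest=0, tag=TAG_RESULT)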