Example #1
    def test_divide_slice(self):
        subset = range(51, 100)

        sub_size = len(subset)
        divide_sub = divide_slices(sub_size, 3, subset.start)

        self.assertEqual(3, len(divide_sub))
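None of these excerpts include the definition of divide_slices itself. Judging from the calls above and below (divide_slices(n, n_slices, start), with the third argument optional and apparently defaulting to 0, returning a list of range objects), a minimal sketch could look like the following; the exact remainder handling in the original project is an assumption.

# Hypothetical sketch of divide_slices, inferred from how the examples call it.
# It splits n consecutive indices, starting at `start`, into n_slices ranges.
def divide_slices(n, n_slices, start=0):
    base, remainder = divmod(n, n_slices)
    slices = []
    lo = start
    for i in range(n_slices):
        # spread the remainder over the first few slices
        hi = lo + base + (1 if i < remainder else 0)
        slices.append(range(lo, hi))
        lo = hi
    return slices

With this sketch, divide_slices(49, 3, 51) from Example #1 returns three ranges covering indices 51 through 99, and divide_slices(16, 5) from Example #4 returns five ranges covering 0 through 15.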
Example #2
def parallel_word_count(corpus_file,
                        output_file,
                        lemmatize,
                        max_rows=None,
                        n_processes=8):
    # get data slices
    input_hdf5 = h5py.File(corpus_file, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    nrows = len(dataset)
    input_hdf5.close()

    # cap the number of rows to process, if a limit was given
    if max_rows is not None and max_rows < nrows:
        nrows = max_rows

    data_slices = divide_slices(nrows, n_processes, 0)

    args = [(corpus_file, data_slice, lemmatize) for data_slice in data_slices]
    pool = Pool(n_processes)

    result = pool.map(func=word_frequencies, iterable=args)
    pool.close()

    # agglomerate results
    freq = Counter()
    for freq_i in result:
        freq = freq + freq_i
    freq = freq.most_common()
    print("{0} unique words".format(len(freq)))
    for i in range(10):
        (w, f) = freq[i]
        print("{0}:{1}".format(w, f))

    if output_file is not None:
        output_hdf5 = h5py.File(output_file, 'w')
        word_ids = range(len(freq))

        # encode explicitly: hdf5 stores variable-length strings with a specific encoding (UTF-8 in this case)
        vocabulary = np.array(
            [freq[i][0].encode("utf8") for i in range(len(freq))])

        dt = h5py.special_dtype(vlen=str)
        output_hdf5.create_dataset("vocabulary",
                                   data=vocabulary,
                                   dtype=dt,
                                   compression="gzip")
        print("vocabulary written")

        freq = np.array([freq[i][1] for i in range(len(freq))])
        output_hdf5.create_dataset("frequencies",
                                   data=freq,
                                   compression="gzip")
        print("frequencies written")

        output_hdf5.close()
        print("done")
Example #3
def parallel_process_corpus():
    n_workers = 4
    pool = Pool(n_workers)
    dataset_slices = divide_slices(max_sentences, n_workers)
    t0 = time.time()
    pool.map(func=do_work, iterable=dataset_slices)
    pool.close()
    pool.join()
    t1 = time.time()
    time.sleep(1)
    print("Done: {0:.2f} secs ".format(t1 - t0))
Example #4
    def test_slice(self):
        v = range(0, 16, 1)
        num_splits = 5

        slices = split.divide_slices(len(v), num_splits)
        self.assertEqual(len(slices), num_splits)

        print(slices)
        for r in slices:
            for elem in r:
                print(elem)
            print("-------")
Example #5
def parallel_ri(corpus_file, max_rows=None, window_size=3, n_processes=8):
    # get data slices
    input_hdf5 = h5py.File(corpus_file, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    nrows = len(dataset)
    input_hdf5.close()

    if max_rows is not None and max_rows < nrows:
        nrows = max_rows

    data_slices = divide_slices(nrows, n_processes, 0)

    args = [(corpus_file, data_slice, window_size)
            for data_slice in data_slices]

    # share a global lock to avoid corrupting the sign index on concurrent lookups
    # l = Lock()
    # pool = Pool(initializer=init_lock(l), initargs=(l,), processes=n_processes)
    pool = Pool(processes=n_processes)

    result = pool.map(func=text_to_ri, iterable=args)
    pool.close()
Example #6
# ======================================================================================
# Master Node
# ======================================================================================
if comm.rank == 0:

    # open hdf5 file and get the dataset
    corpus_hdf5 = h5py.File(corpus_file, 'r')
    corpus_dataset = corpus_hdf5["sentences"]

    if num_rows == -1:
        num_rows = len(corpus_dataset)

    print("Master Node: preparing data, processing [ %d of %d ]" %
          (num_rows, len(corpus_dataset)))

    subset_slices = divide_slices(n=num_rows, n_slices=num_slaves)

    print("Sending Tasks to %d nodes" % (num_slaves))
    # send slices
    for node in range(1, size):
        slice_i = node - 1
        comm.send(subset_slices[slice_i], dest=node)

    print("Data Delivered")
    num_finished = 0

    while num_finished < num_slaves:
        print("Waiting for results...")
        status = MPI.Status()
        data = comm.recv(source=MPI.ANY_SOURCE, status=status)
        tag = status.Get_tag()
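The excerpt ends inside the master's receive loop, and the matching worker branch is not shown. Under the usual mpi4py pattern the master code implies (comm.send of a slice to each worker node, comm.recv with MPI.ANY_SOURCE to collect results), a worker branch could look roughly like the sketch below; process_slice and the tag value are placeholders, not the project's actual code.

# ======================================================================================
# Worker Nodes (hypothetical counterpart to the master branch above)
# ======================================================================================
if comm.rank != 0:
    # receive this node's slice of the corpus from the master
    data_slice = comm.recv(source=0)

    # process_slice stands in for the actual per-node work, which the excerpt does not show
    partial_result = process_slice(corpus_file, data_slice)

    # report the partial result back to the master, which is waiting on MPI.ANY_SOURCE
    comm.send(partial_result, dest=0, tag=0)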