Example #1
    def test_bow_normalise(self):
        data = ["A", "A"]

        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        # avoid shadowing the windows() helper with its own result
        data_windows = windows(data, window_size=1)
        norm_bow = enc.to_bow(data_windows[0], self.sign_index, normalise=True, include_target=True)
        self.assertEqual(np.max(norm_bow), 1)

        unorm_bow = enc.to_bow(data_windows[0], self.sign_index, normalise=False, include_target=True)
        self.assertEqual(np.max(unorm_bow), 2)
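
The two assertions pin down the normalisation behaviour: with data = ["A", "A"] the target and its neighbour are the same sign, so the raw encoding peaks at 2 and the normalised one at 1. A standalone sketch in plain NumPy, assuming normalisation rescales the count vector so its largest entry is 1 (an assumption consistent with the assertions, not a confirmed detail of enc.to_bow):

import numpy as np

# Hypothetical stand-in for the window's raw bag-of-words counts over
# data = ["A", "A"]: "A" contributes twice (target plus one neighbour).
counts = np.array([2.0, 0.0, 0.0])

normalised = counts / np.max(counts)  # rescale so the peak entry is 1
assert np.max(normalised) == 1        # mirrors the normalise=True assertion
assert np.max(counts) == 2            # mirrors the normalise=False assertion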
Example #2
    def test_bow_create(self):
        data = ["A", "B", "A", "C", "A", "B"]

        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        # avoid shadowing the windows() helper with its own result
        data_windows = windows(data, window_size=1)
        vectors = [enc.to_bow(w, self.sign_index) for w in data_windows]
        self.assertEqual(len(vectors), len(data_windows))
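
The test relies on windows() producing exactly one window per token. A minimal sketch of that shape; the Window type, field names, and make_windows are assumptions (only window.target is confirmed by these examples):

from collections import namedtuple

# Hypothetical Window type: a target token plus its neighbours.
Window = namedtuple("Window", ["left", "target", "right"])

def make_windows(tokens, window_size=1):
    # one window per token: the target plus up to window_size neighbours
    return [Window(left=tokens[max(0, i - window_size):i],
                   target=t,
                   right=tokens[i + 1:i + 1 + window_size])
            for i, t in enumerate(tokens)]

print(make_windows(["A", "B", "A"], window_size=1))
# [Window(left=[], target='A', right=['B']),
#  Window(left=['A'], target='B', right=['A']),
#  Window(left=['B'], target='A', right=[])]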
Example #3
    def test_bow_ignore_order(self):
        data1 = ["A", "B"]
        data2 = ["B", "A"]

        for s1, s2 in zip(data1, data2):
            self.sign_index.add(s1)
            self.sign_index.add(s2)

        windows1 = windows(data1, window_size=1)
        windows2 = windows(data2, window_size=1)

        v1 = enc.to_bow(windows1[0], self.sign_index)
        v2 = enc.to_bow(windows2[0], self.sign_index)

        np_test.assert_array_equal(v1, v2)

        a_ri = self.sign_index.get_ri("A")
        b_ri = self.sign_index.get_ri("B")

        np_test.assert_array_equal(v1 - a_ri.to_vector(),
                                   b_ri.to_vector())
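
Both properties the test checks fall out of the encoding being a sum of per-sign vectors: sums commute, so window order cannot matter, and subtracting one member leaves the other. A self-contained illustration with made-up sign vectors (random indexing typically draws sparse ternary {-1, 0, +1} vectors; the values below are assumptions):

import numpy as np

a_ri = np.array([1, -1, 0, 0])
b_ri = np.array([0, 0, 1, -1])

# Addition commutes, so the order of "A" and "B" cannot matter ...
assert np.array_equal(a_ri + b_ri, b_ri + a_ri)
# ... and subtracting one member recovers the other, as the test asserts.
assert np.array_equal((a_ri + b_ri) - a_ri, b_ri)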
Example #4
def text_to_ri(args):
    (fname, data_slice, window_size) = args

    input_hdf5 = h5py.File(fname, 'r')
    dataset_name = "sentences"
    dataset = input_hdf5[dataset_name]
    gen = subset_chunk_it(dataset, data_slice, chunk_size=250)

    pbar = tqdm(total=len(data_slice))

    tokenizer = Tokenizer()
    pipe = WaCKyPipe(gen, tokenizer, filter_stop=False)

    global sign_index
    ri_vectors = dict()

    for tokens in pipe:
        # get sliding windows of given size
        s_windows = windows(tokens, window_size)

        # encode each window as a bag-of-words and accumulate its occurrences
        for window in s_windows:
            # pbar.write(str(window))
            # lock.acquire()
            bow_vector = to_bow(window, sign_index)
            # lock.release()
            bow_vector = np_to_sparse(bow_vector)
            sign_id = sign_index.get_id(window.target)

            if sign_id not in ri_vectors:
                ri_vectors[sign_id] = bow_vector
            else:
                current_vector = ri_vectors[sign_id]
                ri_vectors[sign_id] = bow_vector + current_vector

        pbar.update(1)

    pbar.close()
    input_hdf5.close()

    return ri_vectors
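
The core of this worker is the per-target accumulation: each window's sparse bag-of-words row is summed under the id of its target sign. The same pattern in isolation, assuming np_to_sparse yields SciPy-style sparse rows (an assumption; only the dict bookkeeping comes from the snippet):

from scipy.sparse import csr_matrix

# Two made-up sparse bag-of-words rows for the same target id.
updates = [(7, csr_matrix([[1, 0, 2, 0]])),
           (7, csr_matrix([[0, 1, 0, 0]]))]

ri_vectors = {}
for sign_id, bow_vector in updates:
    if sign_id not in ri_vectors:
        ri_vectors[sign_id] = bow_vector
    else:
        ri_vectors[sign_id] = ri_vectors[sign_id] + bow_vector

print(ri_vectors[7].toarray())  # [[1 1 2 0]]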
Example #5
    if subsampling:
        windows_stream = (windows(list(filter(keep_token, tokens)), window_size) for tokens in pipeline)
    else:
        windows_stream = (windows(tokens, window_size) for tokens in pipeline)

    i = 0
    x_samples = []
    c_samples = []

    # the loop variable must not be named "windows": the lazy generators
    # above still look up the windows() helper on every iteration
    for sentence_windows in tqdm(windows_stream, total=n_rows):
        if len(sentence_windows) > 0:
            # list of (target, ctx) pairs
            for window in sentence_windows:
                target = sign_index.get_ri(window.target).to_vector()
                ctx = to_bow(window, sign_index, include_target=False, normalise=True)

                x_samples.append(target)
                c_samples.append(ctx)

        i += 1

        # batch size in number of sentences
        if i % batch_size == 0 and len(c_samples) > 0:
            # feed data to the model
            x = np.asmatrix(x_samples)
            y = np.asmatrix(c_samples)
            yp = y.copy()
            yp[yp < 0] = 0
            yn = y.copy()
            yn[yn > 0] = 0
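
The yp/yn step splits the signed context matrix into its positive and negative parts, whose sum reconstructs the original; this mirrors the snippet directly, with made-up values:

import numpy as np

# y holds signed context encodings; yp keeps the positive part, yn the
# negative part, and their sum gives back y exactly.
y = np.asmatrix([[2.0, -1.0, 0.0, 3.0]])
yp = y.copy()
yp[yp < 0] = 0
yn = y.copy()
yn[yn > 0] = 0
assert np.array_equal(yp + yn, y)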
Example #6
        i = 0
        x_samples = []
        y_samples = []

        # restart sentence iterator
        sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=10000)
        pipeline.reload(sentences)
        window_stream = get_window_stream(pipeline)

        for sentence_windows in tqdm(window_stream, total=n_rows):
            if len(sentence_windows) > 0:
                # list of (target, ctx) pairs
                for window in sentence_windows:
                    word_t = window.target
                    ctx_ri = to_bow(window,
                                    index,
                                    include_target=False,
                                    normalise=True)
                    target_vector = index.get_ri(word_t).to_dist_vector()

                    x_samples.append(ctx_ri)
                    y_samples.append(target_vector)

            i += 1

            # batch size in number of sentences
            if i % batch_size == 0 and len(y_samples) > 0:
                # print current loss
                if i % 1000 == 0:
                    current_loss = sess.run(loss, {
                        model.input(): x_samples,
                        labels(): y_samples,
                    })
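
Both this example and the previous one batch by sentence count: i counts sentences, and every batch_size sentences the accumulated samples are flushed to the model. That logic, reduced to a hypothetical standalone generator (not part of the project):

def sentence_batches(stream, batch_size):
    buffer = []
    for item in stream:
        buffer.append(item)
        if len(buffer) == batch_size:
            yield buffer
            buffer = []
    if buffer:  # flush the final partial batch
        yield buffer

for batch in sentence_batches(range(7), batch_size=3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]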