Example #1
    def test_contains_id(self):
        dim = 100
        act = 10

        gen = Generator(dim, act)
        sign_index = SignIndex(gen)

        id = 0
        self.assertFalse(sign_index.contains_id(id))
Example #2
    def test_get(self):
        dim = 100
        act = 10

        gen = Generator(dim, act)
        sign_index = SignIndex(gen)

        sign_index.add("0")
        ri0 = sign_index.get_ri("0")
        self.assertIsInstance(ri0, RandomIndex)

        self.assertEqual(ri0.dim, dim)
Example #3
    def test_contains(self):
        dim = 100
        act = 10

        gen = Generator(dim, act)
        sign_index = SignIndex(generator=gen)

        sign_index.add("0")
        self.assertTrue(sign_index.contains("0"))
        self.assertFalse(sign_index.contains("1"))

        sign_index.remove("0")
        self.assertFalse(sign_index.contains("0"))
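
Examples #1-#3 above exercise the basic SignIndex lifecycle: signs are added, checked for membership, mapped to a RandomIndex, and removed. The standalone sketch below repeats those calls outside the unittest harness; it only assumes the Generator(dim, num_active) constructor and the add / contains / get_ri / remove methods used in these tests.

from deepsign.rp.index import Generator, SignIndex

# generator for 100-dimensional random indexes with 10 active (non-zero) entries
gen = Generator(dim=100, num_active=10)
sign_index = SignIndex(gen)

sign_index.add("apple")              # register a sign and assign it a RandomIndex
ri = sign_index.get_ri("apple")      # fetch the RandomIndex for a registered sign
print(sign_index.contains("apple"))  # True
print(ri.dim)                        # 100

sign_index.remove("apple")           # signs can be removed again
print(sign_index.contains("apple"))  # False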
Example #4
class MyTestCase(unittest.TestCase):
    def setUp(self):
        dim = 10
        act = 4
        self.generator = Generator(dim=dim, num_active=act)
        self.sign_index = SignIndex(self.generator)

    def test_encode_sp_create(self):
        sentence = ["A", "B"]

        for word in sentence:
            self.sign_index.add(word)

        ris = []
        for word in sentence:
            ri = self.sign_index.get_ri(word)
            ris.append(ri)

        result = ris_to_sp_tensor_value(ris, self.sign_index.feature_dim())
        print(result)

    def test_encode_sp_positive(self):
        """
        Testing encoding for positive-only sparse random vectors

        """
        sentence = ["A", "B"]

        for word in sentence:
            self.sign_index.add(word)

        ris = []
        for word in sentence:
            ri = self.sign_index.get_ri(word)
            ris.append(ri)

        result = ris_to_sp_tensor_value(ris, self.sign_index.feature_dim(), all_positive=True)
        print(result)
Example #5
    def setUp(self):
        dim = 10
        act = 4
        self.generator = Generator(dim=dim, num_active=act)
        self.sign_index = SignIndex(self.generator)
Example #6
    def setUp(self):
        dim = 10
        act = 2
        self.generator = Generator(dim=dim, num_active=act)
        self.sign_index = SignIndex(self.generator)
        self.perm_generator = PermutationGenerator(dim=dim)
Example #7
class TestEncode(unittest.TestCase):
    def setUp(self):
        dim = 10
        act = 2
        self.generator = Generator(dim=dim, num_active=act)
        self.sign_index = SignIndex(self.generator)
        self.perm_generator = PermutationGenerator(dim=dim)

    def test_bow_create(self):
        data = ["A", "B", "A", "C", "A", "B"]

        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        # don't shadow the windows() helper with its own result
        data_windows = windows(data, window_size=1)
        vectors = [enc.to_bow(w, self.sign_index) for w in data_windows]
        self.assertEqual(len(vectors), len(data_windows))

    def test_bow_normalise(self):
        data = ["A", "A"]

        for s in data:
            self.sign_index.add(s)

        unique_str = set(data)
        self.assertEqual(len(self.sign_index), len(unique_str))

        data_windows = windows(data, window_size=1)
        norm_bow = enc.to_bow(data_windows[0], self.sign_index, normalise=True, include_target=True)
        self.assertEqual(np.max(norm_bow), 1)

        unorm_bow = enc.to_bow(data_windows[0], self.sign_index, normalise=False, include_target=True)
        self.assertEqual(np.max(unorm_bow), 2)


    def test_bow_ignore_order(self):
        data1 = ["A", "B"]
        data2 = ["B", "A"]

        for s1, s2 in zip(data1, data2):
            self.sign_index.add(s1)
            self.sign_index.add(s2)

        windows1 = windows(data1, window_size=1)
        windows2 = windows(data2, window_size=1)

        v1 = enc.to_bow(windows1[0], self.sign_index)
        v2 = enc.to_bow(windows2[0], self.sign_index)

        np_test.assert_array_equal(v1, v2)

        a_ri = self.sign_index.get_ri("A")
        b_ri = self.sign_index.get_ri("B")

        np_test.assert_array_equal(v1 - a_ri.to_vector(),
                                   b_ri.to_vector())

    def test_bow_dir_create(self):
        data1 = ["A", "B", "C"]
        data2 = ["A", "C", "B"]

        for i in range(len(data1)):
            self.sign_index.add(data1[i])
            self.sign_index.add(data2[i])

        w1 = windows(data1, window_size=2)
        w2 = windows(data2, window_size=2)

        perm = self.perm_generator.matrix()
        v1 = enc.to_bow_dir(w1[0], sign_index=self.sign_index, perm_matrix=perm)
        v2 = enc.to_bow_dir(w2[0], sign_index=self.sign_index, perm_matrix=perm)

        self.assertSetEqual(set(w1[0].right), set(w2[0].right))
        np_test.assert_array_equal(v1, v2)
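
The bag-of-words encoding exercised in Example #7 can also be used outside a test case. The sketch below assumes the imports shown in Example #10 (windows from deepsign.data.iterators, the deepsign.rp.encode module aliased as enc) and the to_bow behaviour checked above; each sliding window over the token list is encoded as one random-index bag-of-words vector.

import numpy as np

from deepsign.rp import encode as enc
from deepsign.rp.index import Generator, SignIndex
from deepsign.data.iterators import windows

# small index: 10-dimensional random indexes with 2 active entries
sign_index = SignIndex(Generator(dim=10, num_active=2))

tokens = ["A", "B", "A"]
for token in tokens:
    sign_index.add(token)

# one window per token, then one bag-of-words vector per window
token_windows = list(windows(tokens, window_size=1))
bows = [enc.to_bow(w, sign_index, normalise=True, include_target=True) for w in token_windows]

print(np.vstack(bows).shape)  # one row per window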
Example #8
    result_path = home + "/data/results/"
    corpus_file = home + corpus_file

    print("Reading hdf5 dataset from: ", corpus_file)
    dataset_name = "sentences_lemmatised"

    # open hdf5 file and get the dataset
    h5f = h5py.File(corpus_file, 'r')
    dataset = h5f[dataset_name]
    return dataset

# do something with the dataset

# Create Sign RI Index
ri_gen = Generator(dim=ri_dim, num_active=ri_num_active)
sign_index = SignIndex(ri_gen)

max_sentences = 200000


def load_spacy():
    t0 = time.time()
    # load the spaCy English model with tagger and parser, no entities or word vectors
    nlp = English(entity=False, load_vectors=False, parser=True, tagger=True)
    t1 = time.time()
    print("Done: {0:.2f} secs ".format(t1 - t0))
    return nlp

nlp = load_spacy()

Example #9
    def test_size(self):
        gen = Generator(100, 10)
        sign_index = SignIndex(generator=gen)

        # adding elements should increase size
        self.assertEqual(len(sign_index), 0)

        sign_index.add("0")
        self.assertEqual(len(sign_index), 1)
        self.assertEqual(sign_index.nextID, sign_index.get_id("0") + 1)

        # duplicated elements are not added
        sign_index.add("0")
        self.assertEqual(len(sign_index), 1)

        sign_index.add("1")
        self.assertEqual(len(sign_index), 2)

        # removing elements should reduce size
        size_before = len(sign_index)

        sign_index.remove("0")
        size_after = len(sign_index)
        self.assertEqual(size_after, size_before - 1)
Example #10
import os

import h5py
from tqdm import tqdm

from deepsign.data.corpora.pipe import BNCPipe
from deepsign.rp.encode import to_bow
from deepsign.rp.index import SignIndex, Generator
from deepsign.data.iterators import chunk_it, windows

home = os.getenv("HOME")

data_dir = home + "/data/gold_standards/"
corpus_file = data_dir + "bnc.hdf5"

corpus_hdf5 = h5py.File(corpus_file, 'r')
corpus_dataset = corpus_hdf5["sentences"]

n_rows = 1000
sentences = chunk_it(corpus_dataset, n_rows=n_rows, chunk_size=100000)
pipeline = BNCPipe(datagen=sentences, lemmas=True)

ri_gen = Generator(1000, 10)
index = SignIndex(ri_gen)

for s in tqdm(pipeline, total=n_rows):
    index.add_all(s)

    # don't shadow the windows() helper with its own result
    s_windows = windows(s, window_size=2)

    for window in s_windows:
        # words = window.left + window.right
        # ris = [index.get_ri(word).to_vector() for word in words]
        bow = to_bow(window, index, include_target=False, normalise=True)