예제 #1
0
 def test_t1(self):
     # logger.info("Running Tests4Dataset1test1/test_t1")
     ds = Dataset(TESTFILE1)
     metafile = ds.metafile
     # print("debug: metafile=", metafile, file=sys.stderr)
     assert Dataset._modified4meta(
         metafile, name_part="train.val").endswith("train.val.json")
예제 #2
0
 def test_t9(self):
     # logger.info("Running Tests4Dataset1test1/test_t9")
     ds1 = Dataset(TESTFILE4, reuse_files=False, targets_need_padding=False)
     ds1.target.set_as_onehot(True)
     batch_reshape = ds1.batches_converted(train=False,
                                           batch_size=4,
                                           reshape=True,
                                           convert=True)
     b1r = next(iter(batch_reshape))
     # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!DEBUG: the whole batch:\n",b1r,file=sys.stderr)
     targets = b1r[1]
     indeps = b1r[0]
     # ok, we want one element for each example in the batch
     assert len(targets) == 4
     # get the length of the first sequence from indep by looking at the number of elements
     # of the first feature of the first instance. Note that all sequences of feature values
     # and targets should be padded to the maximum sequence length!
     feature1 = indeps[0]
     len1 = len(feature1[0])
     # check the length of the target sequences
     assert len(targets[0]) == len1
     assert len(targets[1]) == len1
     assert len(targets[2]) == len1
     assert len(targets[3]) == len1
     # TODO: check why this is supposed to be one hot vectors and not indices here!!
     # for each target check that all entries are one-hot vectors of the same length
     for i in range(4):
         for j in range(len1):
             val = targets[i][j]
             assert isinstance(val, list)
             # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!DEBUG: len val ",len(val), "val=",val, file=sys.stderr)
             assert len(val) == 17
예제 #3
0
 def test_reshape_class1(self):
     # 3 instances, each having 2 features
     batch1 = [[[11, 12], 1], [[21, 22], 2], [[31, 32], 3]]
     batch1_reshape = Dataset.reshape_batch_helper(batch1,
                                                   n_features=2,
                                                   is_sequence=False)
     # print("DEBUG: reshape_class_1: batch1_reshape=", batch1_reshape, file=sys.stderr)
     assert batch1_reshape == ([[11, 21, 31], [12, 22, 32]], [1, 2, 3])
     batch1_reshape_np = Dataset.reshape_batch_helper(batch1,
                                                      as_numpy=True,
                                                      n_features=2,
                                                      is_sequence=False)
예제 #4
0
 def test_t4(self):
     # logger.info("Running Tests4Dataset1test1/test_t4")
     ds = Dataset(TESTFILE3)
     logger.debug("TESTFILE3 attrs=%r",
                  ds.meta.get("featureInfo").get("attributes"))
     # Features constructor finishes the vocab, so we need to re-initilize
     features = ds.features
     logger.debug("TESTFILE3 features=%r", features)
     it1 = iter(ds.instances_original())
     rec = next(it1)
     logger.debug("TESTFILE3 rec1=%r", rec)
     logger.debug("TESTFILE3 info=%r" % ds.get_info())
예제 #5
0
    def test_reshape_seq1(self):
        # test reshaping sequence learning batches
        # a simple tiny batch of 3 sequences of feature vectors, each having 2 features and a target
        # the sequence lengths are 1,4,2
        batch1 = [[[[111, 112]], [-11]],
                  [[[211, 212], [221, 222], [231, 232], [241, 242]],
                   [-21, -22, -23, -23]], [[[311, 312], [321, 322]],
                                           [-31, -32]]]
        # print("DEBUG: reshape_seq1: batch1=", batch1, file=sys.stderr)
        batch1_reshape = Dataset.reshape_batch_helper(
            batch1, feature_types=["index", "index"], is_sequence=True)
        # print("DEBUG: reshape_seq1: batch1_reshape=\n", batch1_reshape, file=sys.stderr)
        # print("DEBUG: expected: \n",
        #       [
        #           [[111, 0, 0, 0], [211, 221, 231, 241], [311, 321, 0, 0]],
        #           [[112, 0, 0, 0], [212, 222, 232, 242], [312, 322, 0, 0]]
        #       ],
        #       [
        #           [-11, -1, -1, -1], [-21, -22, -23, -23], [-31, -32, -1, -1]
        #       ], file=sys.stderr)

        assert batch1_reshape == ([[[111, 0, 0, 0], [211, 221, 231, 241],
                                    [311, 321, 0, 0]],
                                   [[112, 0, 0, 0], [212, 222, 232, 242],
                                    [312, 322, 0, 0]]], [[-11, -1, -1, -1],
                                                         [-21, -22, -23, -23],
                                                         [-31, -32, -1, -1]])
예제 #6
0
 def test_t2(self):
     # logger.info("Running Tests4Dataset1test1/test_t2")
     ds = Dataset(TESTFILE1)
     features = ds.features
     s = features.size()
     assert s == 34
     it1 = iter(ds.instances_converted(train=False, convert=True))
     rec = next(it1)
     logger.debug("TESTFILE1 info=%r" % ds.get_info())
     logger.debug("TESTFILE1 rec1=%r" % rec)
     # we expect rec to be a pair: indep and dep
     indep, dep = rec
     # print("DEBUG: rec=", rec, file=sys.stderr)
     # the indep part has as many values as there are features here
     assert len(indep) == 34
     # the dep part is the encoding for two nominal classes,
     assert dep == 0
예제 #7
0
 def test_reshape_class1a(self):
     # 3 instances, each having 2 features, this time indep only
     batch1 = [
         [11, 12],
         [21, 22],
         [31, 32],
     ]
     batch1_reshape = Dataset.reshape_batch_helper(batch1,
                                                   n_features=2,
                                                   is_sequence=False,
                                                   indep_only=True)
     # print("DEBUG: reshape_class_1a: batch1_reshape=", batch1_reshape, file=sys.stderr)
     assert batch1_reshape == [[11, 21, 31], [12, 22, 32]]
     batch1_reshape_np = Dataset.reshape_batch_helper(batch1,
                                                      as_numpy=True,
                                                      n_features=2,
                                                      is_sequence=False,
                                                      indep_only=True)
예제 #8
0
 def test_features1_scaling1(self):
     ds = Dataset(TESTFILE1)
     feature3 = ds.features[3]
     val = feature3(0.5)
     assert val == 0.5
     # print("DEBUG: none / val for 0.5=%s" % val, file=sys.stderr)
     val = feature3(0.5, normalize="minmax")
     assert val == 0.75
     # print("DEBUG: minmax / val for 0.5=%s" % val, file=sys.stderr)
     val = feature3(0.5, normalize="meanvar")
     assert val > 2.3381 and val < 2.3382
예제 #9
0
 def test_reshape_class2(self):
     # 3 instances, each having 2 features, both features are sequences
     # feature value numbers indicate: instance, feature number, sequence element
     # Sequence lengths of first feature: 3, 2, 1,
     # Sequence lengths of second feature: 1, 4, 2
     batch1 = [[[[111, 112, 113], [121]], 1],
               [[[211, 212], [221, 222, 223, 224]], 2],
               [[[311], [321, 322]], 3]]
     batch1_reshape = Dataset.reshape_batch_helper(batch1,
                                                   n_features=2,
                                                   is_sequence=False)
     # print("DEBUG: reshape_class_2: batch1_reshape=", batch1_reshape, file=sys.stderr)
     assert batch1_reshape == ([[[111, 112, 113], [211, 212, 0],
                                 [311, 0, 0]],
                                [[121, 0, 0, 0], [221, 222, 223, 224],
                                 [321, 322, 0, 0]]], [1, 2, 3])
     batch1_reshape_np = Dataset.reshape_batch_helper(batch1,
                                                      as_numpy=True,
                                                      n_features=2,
                                                      is_sequence=False)
예제 #10
0
 def test_t1_1(self):
     # logger.info("Running Tests4Dataset1test1/test_t1_1")
     # test overriding the meta settings: override the "token" embeddings to have emb_dims=123
     # and emb_train=mapping and an emb_file
     ds = Dataset(TESTFILE3, config={"embs": "token:123:yes"})
     feats = ds.features
     # get the vocab of the first feature
     vocf1 = feats[0].vocab
     # print("DEBUG: emb_id=", vocf1.emb_id, "emb_dims=", vocf1.emb_dims, "emb_train=", vocf1.emb_train,
     #      "emb_file=", vocf1.emb_file, file=sys.stderr)
     assert vocf1.emb_id == "token"
     assert vocf1.emb_dims == 123
예제 #11
0
    def test_t3(self):
        # logger.info("Running Tests4Dataset1test1/test_t3")
        ds = Dataset(TESTFILE2)

        it0 = iter(ds.instances_original())
        inst0 = next(it0)
        indep, dep = inst0
        # print("DEBUG: indep=", indep, file=sys.stderr)
        assert indep == [['invincible', 'is', 'a', 'wonderful', 'movie', '.']]
        assert dep == "pos"

        # check low level conversion methods first
        inst1 = [[['invincible', 'is', 'a', 'wonderful', 'movie', '.']], 'pos']
        (indep1, dep1) = inst1

        indep1_conv = ds.features(indep1)
        logger.debug("Original  indep1=%r", indep1)
        logger.debug("Converted indep1=%r", indep1_conv)
        ngram1 = indep1_conv[0]
        assert len(ngram1) == 6
        # print("DEBUG ngram1[0]=", ngram1[0], file=sys.stderr)
        assert ngram1[0] == 3543
        assert ngram1[1] == 9
        it1 = iter(ds.instances_converted(train=False, convert=True))
        rec = next(it1)
        logger.debug("TESTFILE2 rec1=%r", rec)
        logger.debug("TESTFILE2 info=%r" % ds.get_info())
        (indep1_it, dep1_it) = rec
        ngram1_it = indep1_it[0]
        logger.debug("TESTFILE2 dep_it=%r", dep1_it)
        # print("DEBUG dep1_it=", dep1_it, file=sys.stderr)
        assert len(ngram1_it) == 6
        assert ngram1_it[0] == 3543
        assert ngram1_it[1] == 9
        assert dep1_it == 1
예제 #12
0
 def test_utils_1(self):
     l1 = [1, 2, 3]
     Dataset.pad_list_(l1, 10, pad_value=0)
     assert l1 == [1, 2, 3, 0, 0, 0, 0, 0, 0, 0]
     l1 = [1, 2, 3]
     Dataset.pad_list_(l1, 10, pad_value=0, pad_left=True)
     assert l1 == [0, 0, 0, 0, 0, 0, 0, 1, 2, 3]
     l2 = [[1, 2], [], [1, 2, 3, 4], [1], [2, 3]]
     Dataset.pad_matrix_(l2, pad_value=0)
     assert l2 == [[1, 2, 0, 0], [0, 0, 0, 0], [1, 2, 3, 4], [1, 0, 0, 0],
                   [2, 3, 0, 0]]
예제 #13
0
    def test_t5(self):
        # logger.info("Running Tests4Dataset1test1/test_t5")
        ds = Dataset(TESTFILE4)
        it1 = iter(ds.instances_converted(train=False, convert=True))
        rec = next(it1)

        indep, dep = rec
        logger.debug("TESTFILE4: indep=%r" % indep)
        logger.debug("TESTFILE4: dep=%r" % dep)
        logger.debug("TESTFILE4 info=%r" % ds.get_info())
        # the first row is a sequence of 3 elements, with 18 independent
        # features and one of 17 different targets
        # so we should convert this into 18 features which each now should have 3 values
        # and 3 onehot vectors for the class

        assert len(dep) == 3
        assert len(indep) == 3  # 3 elements in the sequence
        assert len(indep[0]) == 18
        assert len(indep[1]) == 18
        assert len(indep[2]) == 18
        # check if the class is actually ADJ for all three targets
        dep1 = dep[0]
        dep2 = dep[1]
        dep3 = dep[2]
        t11 = ds.target.vocab.idx2string(dep1)
        assert t11 == "ADJ"
        t12 = ds.target.vocab.idx2string(dep2)
        assert t12 == "ADJ"
        t13 = ds.target.vocab.idx2string(dep3)
        assert t13 == "ADJ"
        # test getting batches in non-reshaped form
        bit1 = ds.batches_converted(train=False,
                                    convert=True,
                                    batch_size=2,
                                    reshape=False)
        biter1 = iter(bit1)
        batch1 = next(biter1)
        # print("DEBUG: TESTFILE4 batch/noreshape=%s" % (batch1,), file=sys.stderr)
        assert len(batch1) == 2
        # test getting batches in reshaped form
        bit2 = ds.batches_converted(train=False,
                                    convert=True,
                                    batch_size=2,
                                    reshape=True)
        biter2 = iter(bit2)
        batch2 = next(biter2)
        # print("DEBUG: TESTFILE4 batch/noreshape=%s" % (batch1,), file=sys.stderr)
        bindep, bdep = batch2
        assert len(bindep) == 18
        assert len(bdep) == 2
        assert len(bindep[0]) == 2
예제 #14
0
    def test_t8(self):
        # logger.info("Running Tests4Dataset1test1/test_t8")
        ds = Dataset(TESTFILE3, reuse_files=True)
        # print("debug orig_train_file=", ds.orig_train_file, file=sys.stderr)
        num_idxs = ds.get_float_feature_idxs()
        # print(file=sys.stderr)
        # print("File", TESTFILE3, file=sys.stderr)
        # print("DEBUG float_idxs=", num_idxs, file=sys.stderr)
        nom_idxs = ds.get_index_feature_idxs()
        # print("DEBUG index_idxs=", nom_idxs, file=sys.stderr)
        ngr_idxs = ds.get_indexlist_feature_idxs()
        # print("DEBUG indexlist_idxs=", ngr_idxs, file=sys.stderr)

        ds = Dataset(TESTFILE4, reuse_files=True)
        # print("debug orig_train_file=", ds.orig_train_file, file=sys.stderr)
        num_idxs = ds.get_float_feature_idxs()
        # print(file=sys.stderr)
        # print("File", TESTFILE4, file=sys.stderr)
        # print("DEBUG float_idxs=", num_idxs, file=sys.stderr)
        nom_idxs = ds.get_index_feature_idxs()
        # print("DEBUG index_idxs=", nom_idxs, file=sys.stderr)
        ngr_idxs = ds.get_indexlist_feature_idxs()
        # print("DEBUG indexlist_idxs=", ngr_idxs, file=sys.stderr)

        ds = Dataset(TESTFILE2, reuse_files=True)
        # print("debug orig_train_file=", ds.orig_train_file, file=sys.stderr)
        num_idxs = ds.get_float_feature_idxs()
        # print(file=sys.stderr)
        # print("File", TESTFILE2, file=sys.stderr)
        # print("DEBUG float_idxs=", num_idxs, file=sys.stderr)
        nom_idxs = ds.get_index_feature_idxs()
        # print("DEBUG index_idxs=", nom_idxs, file=sys.stderr)
        ngr_idxs = ds.get_indexlist_feature_idxs()
        # print("DEBUG indexlist_idxs=", ngr_idxs, file=sys.stderr)

        ds = Dataset(TESTFILE1, reuse_files=True)
        # print("debug orig_train_file=", ds.orig_train_file, file=sys.stderr)
        num_idxs = ds.get_float_feature_idxs()
        # print(file=sys.stderr)
        # print("File", TESTFILE1, file=sys.stderr)
        # print("DEBUG float_idxs=", num_idxs, file=sys.stderr)
        nom_idxs = ds.get_index_feature_idxs()
        # print("DEBUG index_idxs=", nom_idxs, file=sys.stderr)
        ngr_idxs = ds.get_indexlist_feature_idxs()
예제 #15
0
 def test_t7(self):
     # logger.info("Running Tests4Dataset1test1/test_t7")
     ds = Dataset(TESTFILE3)
     ds.split(convert=True,
              keep_orig=True,
              validation_size=3,
              random_seed=1)
     # check if getting the batches and validation sets works
     valset_orig = ds.validation_set_orig()
     # print("DEBUG: valset_orig=%s" % valset_orig, file=sys.stderr)
     assert len(valset_orig) == 3
     vorigi2 = valset_orig[1]
     assert vorigi2 == [[
         'you', 'think', 'this', 'place', 'is', 'nice', 'VERB', 'DET', 'a',
         'a', 'a', 'a', 'a', 'a', '', 'nk', 'is', 'ce', '', 'ce', '', 'ink',
         '', 'ace', '', ''
     ], 'NOUN']
     valset_conv = ds.validation_set_converted()
     # print("DEBUG: valset_conv=%s" % valset_conv, file=sys.stderr)
     assert len(valset_conv) == 3
     vconvi2 = valset_conv[1]
     # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! DEBUG: vconvi2=", vconvi2, file=sys.stderr)
     assert vconvi2 == [[
         13, 157, 25, 104, 12, 319, 2, 5, 2, 2, 2, 2, 2, 2, 0, 151, 28, 14,
         0, 14, 0, 215, 0, 101, 0, 0
     ], 0]
     valset_conv_b = ds.validation_set_converted(as_batch=True)
     # print("DEBUG: valset_conv_b=%s" % (valset_conv_b,), file=sys.stderr)
     # we expect a tuple for indep and dep
     assert len(valset_conv_b) == 2
     indep1, dep1 = valset_conv_b
     # the indep part should now have lenth equal to the number of features
     assert len(indep1) == ds.nFeatures
     # there should be 3 values for that first feature
     assert len(indep1[0]) == 3
     # get a batch of original data
     bitb1 = ds.batches_original(train=True, batch_size=4, reshape=False)
     batch_orig1 = next(iter(bitb1))
     # print("DEBUG: batch_orig1=%s" % (batch_orig1,), file=sys.stderr)
     # if reshape was False, this is just a list of instances in original format
     assert len(batch_orig1) == 4
     assert batch_orig1[1] == [[
         'Bill', 'Bradford', 'in', 'Credit', 'are', 'supposed', 'PROPN',
         'ADP', 'Aa', 'Aa', 'a', 'Aa', 'a', 'a', 'll', 'rd', '', 'it', '',
         'ed', '', 'ord', '', 'dit', '', 'sed'
     ], 'NOUN']
     bitb2 = ds.batches_original(train=True, batch_size=4, reshape=True)
     batch_orig2 = next(iter(bitb2))
     # print("DEBUG: batch_orig2=%s" % (batch_orig2,), file=sys.stderr)
     # if reshape was True, this is a tuple where the first element is the list of features
     assert len(batch_orig2) == 2
     featurelist1 = batch_orig2[0]
     feature1 = featurelist1[0]
     assert feature1[1] == 'Bill'
     bconvb1 = ds.batches_converted(train=True, batch_size=4, reshape=False)
     batch_conv1 = next(iter(bconvb1))
     # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! DEBUG: !!!batch_conv1[1]=%s" % (batch_conv1[1],), file=sys.stderr)
     assert len(batch_conv1) == 4
     # TODO: check why some indices changed between previously and now and if this is till correct!
     assert batch_conv1[1] == [[
         1210, 1495, 9, 796, 23, 3075, 6, 3, 3, 3, 2, 3, 2, 2, 20, 54, 0,
         86, 0, 2, 0, 391, 0, 300, 0, 77
     ], 0]
     bconvb2 = ds.batches_converted(train=True, batch_size=4, reshape=True)
     batch_conv2 = next(iter(bconvb2))
     # print("DEBUG: batch_conv2=%s" % (batch_conv2,), file=sys.stderr)
     assert len(batch_conv2) == 2
     featurelist1 = batch_conv2[0]
     feature1 = featurelist1[0]
     assert feature1[1] == 1210
예제 #16
0
 def test_t6(self):
     # logger.info("Running Tests4Dataset1test1/test_t6")
     ds = Dataset(TESTFILE2)
     ds.split(convert=True,
              keep_orig=True,
              validation_size=3,
              random_seed=1)
     # check if getting the batches and validation sets works
     valset_orig = ds.validation_set_orig()
     # print("DEBUG: valset_orig=%s" % valset_orig, file=sys.stderr)
     assert len(valset_orig) == 3
     vorigi2 = valset_orig[1]
     assert vorigi2 == [[[
         'a', 'very', 'well-made', ',', 'funny', 'and', 'entertaining',
         'picture', '.'
     ]], 'pos']
     valset_conv = ds.validation_set_converted()
     # print("DEBUG: valset_conv=%s" % valset_conv, file=sys.stderr)
     assert len(valset_conv) == 3
     vconvi2 = valset_conv[1]
     # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! DEBUG: vconvi2=", vconvi2, file=sys.stderr)
     assert vconvi2 == [[[5, 84, 1530, 4, 75, 6, 190, 175, 2]], 1]
     valset_conv_b = ds.validation_set_converted(as_batch=True)
     # print("DEBUG: valset_conv_b=%s" % (valset_conv_b,), file=sys.stderr)
     # we expect a tuple for indep and dep
     assert len(valset_conv_b) == 2
     indep1, dep1 = valset_conv_b
     # the indep part should now have lenth one because there is only one feature
     assert len(indep1) == 1
     # there should be 3 values for that first feature
     # print("DEBUG: indep1[0]=%r" % (indep1[0]), file=sys.stderr)
     assert len(indep1[0]) == 3
     # get a batch of original data
     bitb1 = ds.batches_original(train=True, batch_size=4, reshape=False)
     batch_orig1 = next(iter(bitb1))
     # print("DEBUG: batch_orig1=%s" % (batch_orig1,), file=sys.stderr)
     # if reshape was False, this is just a list of instances in original format
     assert len(batch_orig1) == 4
     assert batch_orig1[1] == [[[
         'rife', 'with', 'nutty', 'cliches', 'and', 'far', 'too', 'much',
         'dialogue', '.'
     ]], 'neg']
     bitb2 = ds.batches_original(train=True, batch_size=4, reshape=True)
     batch_orig2 = next(iter(bitb2))
     # print("DEBUG: batch_orig2=%s" % (batch_orig2,), file=sys.stderr)
     # if reshape was True, this is a tuple where the first element is the list of features
     assert len(batch_orig2) == 2
     featurelist1 = batch_orig2[0]
     feature1 = featurelist1[0]
     # print("DEBUG: feature1[1]=%s" % (feature1[1],), file=sys.stderr)
     assert feature1[1] == [
         'rife', 'with', 'nutty', 'cliches', 'and', 'far', 'too', 'much',
         'dialogue', '.', '', '', '', '', '', '', '', '', '', '', '', '',
         '', '', '', '', '', '', '', '', '', '', '', '', '', ''
     ]
     bconvb1 = ds.batches_converted(train=True, batch_size=4, reshape=False)
     batch_conv1 = next(iter(bconvb1))
     # print("DEBUG: batch_conv1=%s" % (batch_conv1,), file=sys.stderr)
     assert len(batch_conv1) == 4
     # print("DEBUG: batch_conv1[1]=%s" % (batch_conv1[1],), file=sys.stderr)
     assert batch_conv1[1] == [[[
         6694, 17, 6469, 544, 6, 168, 51, 59, 237, 2
     ]], 0]
     bconvb2 = ds.batches_converted(train=True, batch_size=4, reshape=True)
     batch_conv2 = next(iter(bconvb2))
     # print("DEBUG: batch_conv2=%s" % (batch_conv2,), file=sys.stderr)
     assert len(batch_conv2) == 2
     featurelist1 = batch_conv2[0]
     feature1 = featurelist1[0]
     # print("DEBUG: feature1[1]=%s" % (feature1[1],), file=sys.stderr)
     assert feature1[1] == [
         6694, 17, 6469, 544, 6, 168, 51, 59, 237, 2, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     ]
예제 #17
0
 def test_t1_2(selfs):
     # logger.info("Running Tests4Dataset1test1/test_t1_2")
     # check a simple sequence dataset with just one nominal feature without any embeddings definitions
     ds = Dataset(TESTFILE5)
     feats = ds.features
     vocf1 = feats[0].vocab