def test_t3(self):
    """Check original and converted access to the first instance of TESTFILE2.

    Verifies that the first unconverted instance is the expected token list
    with target ``"pos"``, that ``ds.features`` converts that token list to
    the expected indices, and that the instance iterator with
    ``convert=True`` yields the same indices together with target ``1``.
    """
    ds = Dataset(TESTFILE2)
    tokens = ['invincible', 'is', 'a', 'wonderful', 'movie', '.']

    # The unconverted instance is an (indep, dep) pair whose independent
    # part holds a single feature: the token sequence.
    indep, dep = next(iter(ds.instances_original()))
    assert indep == [tokens]
    assert dep == "pos"

    # Check the low level conversion method first. A fresh copy of the
    # token list is passed so the expected value above cannot be affected.
    indep1 = [list(tokens)]
    indep1_conv = ds.features(indep1)
    logger.debug("Original indep1=%r", indep1)
    logger.debug("Converted indep1=%r", indep1_conv)
    ngram1 = indep1_conv[0]
    assert len(ngram1) == 6
    assert ngram1[0] == 3543
    assert ngram1[1] == 9

    # The converting iterator must produce the same indices for the first
    # instance as the direct call to ds.features did.
    rec = next(iter(ds.instances_converted(train=False, convert=True)))
    logger.debug("TESTFILE2 rec1=%r", rec)
    # Pass the value as a logger argument instead of using "%": this defers
    # formatting and cannot fail if the value happens to be a tuple.
    logger.debug("TESTFILE2 info=%r", ds.get_info())
    indep1_it, dep1_it = rec
    ngram1_it = indep1_it[0]
    logger.debug("TESTFILE2 dep_it=%r", dep1_it)
    assert len(ngram1_it) == 6
    assert ngram1_it[0] == 3543
    assert ngram1_it[1] == 9
    assert dep1_it == 1
def test_t5(self):
    """Check converted sequence instances and batching for TESTFILE4.

    The first converted instance is expected to be a sequence of 3 elements,
    each with 18 independent values and a target that decodes to ``"ADJ"``.
    Batches of size 2 are then fetched both without and with reshaping.
    """
    ds = Dataset(TESTFILE4)
    rec = next(iter(ds.instances_converted(train=False, convert=True)))
    indep, dep = rec
    # Values are passed as logger arguments rather than formatted with "%":
    # indep and dep are sequences, and "fmt" % some_tuple would raise
    # TypeError ("not all arguments converted") for a 3-element tuple.
    logger.debug("TESTFILE4: indep=%r", indep)
    logger.debug("TESTFILE4: dep=%r", dep)
    logger.debug("TESTFILE4 info=%r", ds.get_info())

    # The first row is a sequence of 3 elements: one target per element
    # and 18 independent values for each element.
    assert len(dep) == 3
    assert len(indep) == 3
    for pos in range(3):
        assert len(indep[pos]) == 18

    # Check that the class is actually ADJ for all three targets.
    for pos in range(3):
        assert ds.target.vocab.idx2string(dep[pos]) == "ADJ"

    # Test getting batches in non-reshaped form: the batch has one entry
    # per instance.
    batch1 = next(iter(ds.batches_converted(
        train=False, convert=True, batch_size=2, reshape=False)))
    assert len(batch1) == 2

    # Test getting batches in reshaped form: the batch is an (indep, dep)
    # pair, with 18 entries on the independent side that each cover the
    # 2 instances of the batch.
    batch2 = next(iter(ds.batches_converted(
        train=False, convert=True, batch_size=2, reshape=True)))
    bindep, bdep = batch2
    assert len(bindep) == 18
    assert len(bdep) == 2
    assert len(bindep[0]) == 2
def test_t2(self):
    """Check feature count and the first converted instance of TESTFILE1.

    The dataset is expected to report 34 features, and the first converted
    instance must be an (indep, dep) pair with 34 independent values and a
    target equal to ``0``.
    """
    ds = Dataset(TESTFILE1)
    assert ds.features.size() == 34

    rec = next(iter(ds.instances_converted(train=False, convert=True)))
    # rec is unpacked as a pair below, so it must not be formatted with
    # "fmt" % rec: for a 2-tuple that raises TypeError. Passing it as a
    # logger argument is safe and defers the formatting.
    logger.debug("TESTFILE1 info=%r", ds.get_info())
    logger.debug("TESTFILE1 rec1=%r", rec)

    # We expect rec to be a pair: indep and dep.
    indep, dep = rec
    # The indep part has as many values as there are features here.
    assert len(indep) == 34
    # The dep part is the encoded target class (presumably one of two
    # nominal classes in this file -- confirm against the test data).
    assert dep == 0