import os
import shutil
from collections import Counter
from urllib.parse import urljoin
from urllib.request import pathname2url

import torch
from torchtext.vocab import Vectors

# Module paths below follow deepmatcher's layout; adjust if your package differs.
from deepmatcher.data.dataset import MatchingDataset
from deepmatcher.data.field import MatchingField, MatchingVocab, reset_vector_cache

# Assumption: the directory containing this test module (and its test_datasets folder).
test_dir_path = os.path.dirname(os.path.abspath(__file__))


def test_init_2(self):
    mf = MatchingField()
    seq = 'Hello, This is a test sequence for tokenizer.'
    tok_seq = [
        'Hello', ',', 'This', 'is', 'a', 'test', 'sequence', 'for',
        'tokenizer', '.'
    ]
    self.assertEqual(mf.tokenize(seq), tok_seq)

def test_preprocess_args_1(self):
    mf = MatchingField()
    arg_dict = mf.preprocess_args()
    res_dict = {
        'sequential': True,
        'init_token': None,
        'eos_token': None,
        'lower': False,
        'preprocessing': None,
        'tokenizer_arg': 'moses',
        'unk_token': '<unk>'
    }
    self.assertEqual(arg_dict, res_dict)

def test_init_1(self):
    fields = [('left_a', MatchingField()), ('right_a', MatchingField())]
    col_naming = {'id': 'id', 'label': 'label', 'left': 'left', 'right': 'right'}
    path = os.path.join('.', 'test_datasets', 'sample_table_small.csv')
    md = MatchingDataset(fields, col_naming, path=path)
    self.assertEqual(md.id_field, 'id')
    self.assertEqual(md.label_field, 'label')
    self.assertEqual(md.all_left_fields, ['left_a'])
    self.assertEqual(md.all_right_fields, ['right_a'])
    self.assertEqual(md.all_text_fields, ['left_a', 'right_a'])
    self.assertEqual(md.canonical_text_fields, ['_a'])

def test_extend_vectors_1(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    # Load a small local vector file through a file:// URL.
    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec'
    file = os.path.join(pathdir, filename)
    url_base = urljoin('file:', pathname2url(file))
    vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
    self.assertIsInstance(vecs, Vectors)

    vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
    v = MatchingVocab(Counter())
    v.vectors = torch.Tensor(1, vec_data[0].dim)
    v.unk_init = torch.Tensor.zero_
    tokens = {'hello', 'world'}
    v.extend_vectors(tokens, vec_data)
    self.assertEqual(len(v.itos), 4)
    self.assertEqual(v.vectors.size(), torch.Size([4, 300]))
    self.assertEqual(list(v.vectors[2][0:10]), [0.0] * 10)
    self.assertEqual(list(v.vectors[3][0:10]), [0.0] * 10)

    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

def test_class_matching_dataset():
    fields = [("left_a", MatchingField()), ("right_a", MatchingField())]
    col_naming = {
        "id": "id",
        "label": "label",
        "left": "left",
        "right": "right"
    }
    path = os.path.join(test_dir_path, "test_datasets", "sample_table_small.csv")
    md = MatchingDataset(fields, col_naming, path=path)
    assert md.id_field == "id"
    assert md.label_field == "label"
    assert md.all_left_fields == ["left_a"]
    assert md.all_right_fields == ["right_a"]
    assert md.all_text_fields == ["left_a", "right_a"]
    assert md.canonical_text_fields == ["_a"]

def test_extend_vocab_1(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    mf = MatchingField()
    lf = MatchingField(id=True, sequential=False)
    fields = [('id', lf), ('left_a', mf), ('right_a', mf), ('label', lf)]
    col_naming = {
        'id': 'id',
        'label': 'label',
        'left': 'left_',
        'right': 'right_'
    }

    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec'
    file = os.path.join(pathdir, filename)
    url_base = urljoin('file:', pathname2url(file))
    vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)

    data_path = os.path.join(test_dir_path, 'test_datasets',
                             'sample_table_small.csv')
    md = MatchingDataset(fields, col_naming, path=data_path)

    mf.build_vocab()
    mf.vocab.vectors = torch.Tensor(len(mf.vocab.itos), 300)
    mf.extend_vocab(md, vectors=vecs)
    self.assertEqual(len(mf.vocab.itos), 6)
    self.assertEqual(mf.vocab.vectors.size(), torch.Size([6, 300]))

def _make_fields(header, id_attr, label_attr, ignore_columns, lower, tokenize,
                 include_lengths):
    """Create field metadata, i.e., attribute processing specifications for
    each attribute.

    This includes fields for the label and ID columns.

    Returns:
        list(tuple(str, MatchingField)): A list of tuples containing column
            name (e.g. "left_address") and corresponding
            :class:`~data.MatchingField` pairs, in the same order that the
            columns occur in the CSV file.
    """
    text_field = MatchingField(
        lower=lower,
        tokenize=tokenize,
        init_token='<<<',
        eos_token='>>>',
        batch_first=True,
        include_lengths=include_lengths)
    numeric_field = MatchingField(
        sequential=False, preprocessing=lambda x: int(x), use_vocab=False)
    id_field = MatchingField(sequential=False, use_vocab=False, id=True)

    fields = []
    for attr in header:
        if attr == id_attr:
            fields.append((attr, id_field))
        elif attr == label_attr:
            fields.append((attr, numeric_field))
        elif attr in ignore_columns:
            fields.append((attr, None))
        else:
            fields.append((attr, text_field))
    return fields

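# A minimal usage sketch for _make_fields. The header and attribute names here
# are illustrative assumptions, not taken from the test data above:
#
#   header = ['id', 'label', 'left_name', 'right_name', 'comments']
#   fields = _make_fields(
#       header, id_attr='id', label_attr='label', ignore_columns=['comments'],
#       lower=True, tokenize='moses', include_lengths=True)
#
# This yields, in CSV column order:
#   ('id', <id field>), ('label', <numeric field>),
#   ('left_name', <text field>), ('right_name', <text field>),
#   ('comments', None)
# Ignored columns map to None so downstream processing skips them.
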
def test_get_vector_data(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    pathdir = os.path.abspath(os.path.join('.', 'test_datasets'))
    filename = 'fasttext_sample.vec'
    file = os.path.join(pathdir, filename)
    url_base = urljoin('file:', pathname2url(file))
    vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
    self.assertIsInstance(vecs, Vectors)

    vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
    self.assertEqual(len(vec_data), 1)

    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

def test_init_1(self):
    mf = MatchingField()
    self.assertTrue(mf.sequential)

def test_reset_vector_cache_1(self):
    mf = MatchingField()
    reset_vector_cache()
    self.assertDictEqual(mf._cached_vec_data, {})

def test_numericalize_2(self):
    mf = MatchingField()
    arr = [['a'], ['b'], ['c']]
    # Without a built vocab, a sequential field has no token-to-index mapping;
    # expecting the lookup to fail with AttributeError is an assumption here.
    with self.assertRaises(AttributeError):
        mf.numericalize(arr)

def test_numericalize_1(self):
    mf = MatchingField(id=True)
    arr = [[1], [2], [3]]
    # An id field should pass values through unchanged instead of mapping
    # them through a vocabulary.
    self.assertEqual(mf.numericalize(arr), [[1], [2], [3]])

def test_build_vocab_3(self):
    mf = MatchingField()
    vector_file_name = 'fasttext.crawl_test.vec'
    cache_dir = os.path.join(test_dir_path, 'test_datasets')
    vec_data = mf.build_vocab(vectors=vector_file_name, cache=cache_dir)
    self.assertIsNone(vec_data)

def test_build_vocab_2(self):
    mf = MatchingField()
    vector_file_name = 'fasttext.wiki_test.vec'
    cache_dir = os.path.join(test_dir_path, 'test_datasets')
    vec_data = mf.build_vocab(vectors=vector_file_name, cache=cache_dir)
    self.assertIsNone(vec_data)

def test_build_vocab_1(self):
    mf = MatchingField()
    mf.build_vocab()

def test_init_3(self):
    # An unrecognized tokenizer name should be rejected at construction time;
    # treating the error type as ValueError is an assumption.
    with self.assertRaises(ValueError):
        MatchingField(tokenize='random string')