Example #1
    def test_init_2(self):
        mf = MatchingField()
        seq = 'Hello, This is a test sequence for tokenizer.'
        tok_seq = [
            'Hello', ',', 'This', 'is', 'a', 'test', 'sequence', 'for', 'tokenizer', '.'
        ]
        self.assertEqual(mf.tokenize(seq), tok_seq)
Example #2
    def test_preprocess_args_1(self):
        mf = MatchingField()
        arg_dict = mf.preprocess_args()
        res_dict = {'sequential': True, 'init_token': None,
                    'eos_token': None, 'lower': False,
                    'preprocessing': None, 'tokenizer_arg': 'moses',
                    'unk_token': '<unk>'}
        self.assertEqual(arg_dict, res_dict)
Example #3
    def test_init_1(self):
        fields = [('left_a', MatchingField()), ('right_a', MatchingField())]
        col_naming = {'id': 'id', 'label': 'label', 'left': 'left',
                      'right': 'right'}
        path = os.path.join('.', 'test_datasets', 'sample_table_small.csv')
        md = MatchingDataset(fields, col_naming, path=path)
        self.assertEqual(md.id_field, 'id')
        self.assertEqual(md.label_field, 'label')
        self.assertEqual(md.all_left_fields, ['left_a'])
        self.assertEqual(md.all_right_fields, ['right_a'])
        self.assertEqual(md.all_text_fields, ['left_a', 'right_a'])
        self.assertEqual(md.canonical_text_fields, ['_a'])
Example #4
    def test_extend_vectors_1(self):
        # Start from a clean vector cache.
        vectors_cache_dir = '.cache'
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

        # Serve the bundled fastText sample through a file:// URL so the
        # Vectors loader needs no network access.
        pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
        filename = 'fasttext_sample.vec'
        file = os.path.join(pathdir, filename)
        url_base = urljoin('file:', pathname2url(file))
        vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
        self.assertIsInstance(vecs, Vectors)

        # Extend a vocab that holds only the specials with two new tokens;
        # the appended rows are expected to be zero-initialized via unk_init.
        vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
        v = MatchingVocab(Counter())
        v.vectors = torch.Tensor(1, vec_data[0].dim)
        v.unk_init = torch.Tensor.zero_
        tokens = {'hello', 'world'}
        v.extend_vectors(tokens, vec_data)
        self.assertEqual(len(v.itos), 4)
        self.assertEqual(v.vectors.size(), torch.Size([4, 300]))
        self.assertEqual(list(v.vectors[2][0:10]), [0.0] * 10)
        self.assertEqual(list(v.vectors[3][0:10]), [0.0] * 10)

        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)
Example #5
def test_class_matching_dataset():
    fields = [("left_a", MatchingField()), ("right_a", MatchingField())]
    col_naming = {
        "id": "id",
        "label": "label",
        "left": "left",
        "right": "right"
    }
    path = os.path.join(test_dir_path, "test_datasets",
                        "sample_table_small.csv")
    md = MatchingDataset(fields, col_naming, path=path)
    assert md.id_field == "id"
    assert md.label_field == "label"
    assert md.all_left_fields == ["left_a"]
    assert md.all_right_fields == ["right_a"]
    assert md.all_text_fields == ["left_a", "right_a"]
    assert md.canonical_text_fields == ["_a"]
Example #6
    def test_extend_vocab_1(self):
        # Start from a clean vector cache.
        vectors_cache_dir = '.cache'
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

        mf = MatchingField()
        lf = MatchingField(id=True, sequential=False)
        fields = [('id', lf), ('left_a', mf), ('right_a', mf), ('label', lf)]
        col_naming = {
            'id': 'id',
            'label': 'label',
            'left': 'left_',
            'right': 'right_'
        }

        # Serve the bundled fastText sample through a file:// URL so no
        # network access is needed.
        pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
        filename = 'fasttext_sample.vec'
        file = os.path.join(pathdir, filename)
        url_base = urljoin('file:', pathname2url(file))
        vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)

        data_path = os.path.join(test_dir_path, 'test_datasets',
                                 'sample_table_small.csv')
        md = MatchingDataset(fields, col_naming, path=data_path)

        # Build a vocab from nothing, then extend it with the dataset's
        # tokens and their vectors.
        mf.build_vocab()
        mf.vocab.vectors = torch.Tensor(len(mf.vocab.itos), 300)
        mf.extend_vocab(md, vectors=vecs)
        self.assertEqual(len(mf.vocab.itos), 6)
        self.assertEqual(mf.vocab.vectors.size(), torch.Size([6, 300]))
Example #7
def _make_fields(header, id_attr, label_attr, ignore_columns, lower, tokenize,
                 include_lengths):
    """Create field metadata, i.e., attribute processing specification for each attribute.

    This includes fields for label and ID columns.

    Returns:
        list(tuple(str, MatchingField)): A list of tuples containing column name
            (e.g. "left_address") and corresponding :class:`~data.MatchingField` pairs,
            in the same order that the columns occur in the CSV file.

    """

    text_field = MatchingField(
        lower=lower,
        tokenize=tokenize,
        init_token="<<<",
        eos_token=">>>",
        batch_first=True,
        include_lengths=include_lengths,
    )
    numeric_field = MatchingField(sequential=False,
                                  preprocessing=lambda x: int(x),
                                  use_vocab=False)
    id_field = MatchingField(sequential=False, use_vocab=False, id=True)

    fields = []
    for attr in header:
        if attr == id_attr:
            fields.append((attr, id_field))
        elif attr == label_attr:
            fields.append((attr, numeric_field))
        elif attr in ignore_columns:
            fields.append((attr, None))
        else:
            fields.append((attr, text_field))
    return fields
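
For reference, a minimal sketch of how _make_fields might be called; the header and column names below are hypothetical, not taken from the library:

# Hypothetical CSV header; the column names are illustrative only.
header = ['id', 'label', 'left_name', 'right_name', 'notes']
fields = _make_fields(header,
                      id_attr='id',
                      label_attr='label',
                      ignore_columns=['notes'],  # mapped to None, so dropped
                      lower=True,
                      tokenize='moses',
                      include_lengths=True)
# Result, in CSV column order:
# [('id', <id field>), ('label', <numeric field>),
#  ('left_name', <text field>), ('right_name', <text field>),
#  ('notes', None)]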
Example #8
    def test_get_vector_data(self):
        vectors_cache_dir = '.cache'
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

        pathdir = os.path.abspath(os.path.join('.', 'test_datasets'))
        filename = 'fasttext_sample.vec'
        file = os.path.join(pathdir, filename)
        url_base = urljoin('file:', pathname2url(file))
        vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)
        self.assertIsInstance(vecs, Vectors)

        vec_data = MatchingField._get_vector_data(vecs, vectors_cache_dir)
        self.assertEqual(len(vec_data), 1)

        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)
Example #9
    def test_init_1(self):
        mf = MatchingField()
        self.assertTrue(mf.sequential)
Example #10
    def test_reset_vector_cache_1(self):
        mf = MatchingField()
        reset_vector_cache()
        self.assertDictEqual(mf._cached_vec_data, {})
Example #11
    def test_numericalize_2(self):
        mf = MatchingField()
        arr = [['a'], ['b'], ['c']]
        mf.numericalize(arr)
Example #12
    def test_numericalize_1(self):
        mf = MatchingField(id=True)
        arr = [[1], [2], [3]]
        mf.numericalize(arr)
        # id fields pass their data through numericalize unchanged.
        self.assertEqual(arr, [[1], [2], [3]])
Example #13
    def test_build_vocab_3(self):
        mf = MatchingField()
        vector_file_name = 'fasttext.crawl_test.vec'
        cache_dir = os.path.join(test_dir_path, 'test_datasets')
        vec_data = mf.build_vocab(vectors=vector_file_name, cache=cache_dir)
        self.assertIsNone(vec_data)
Example #14
    def test_build_vocab_2(self):
        mf = MatchingField()
        vector_file_name = 'fasttext.wiki_test.vec'
        cache_dir = os.path.join(test_dir_path, 'test_datasets')
        vec_data = mf.build_vocab(vectors=vector_file_name, cache=cache_dir)
Example #15
    def test_build_vocab_1(self):
        mf = MatchingField()
        mf.build_vocab()
Example #16
    def test_init_3(self):
        # The excerpt ends without an assertion; presumably this checks
        # that an unrecognized tokenizer is rejected (any expected-exception
        # decorator was lost when the test was excerpted).
        mf = MatchingField(tokenize='random string')