示例#1
0
def fields(id_attr, label_attr, data_dir, train_filename):
    """Build the field list for a training CSV from its header row.

    Only the first row of the file is consumed: it is parsed with
    ``unicode_csv_reader`` and handed to ``_make_fields``.

    Args:
        id_attr: Passed through to ``_make_fields`` (presumably the name of
            the ID column -- confirm against ``_make_fields``).
        label_attr: Passed through to ``_make_fields`` (presumably the name
            of the label column -- confirm against ``_make_fields``).
        data_dir: Directory holding the training file.
        train_filename: Name of the training CSV inside ``data_dir``.

    Returns:
        Whatever ``_make_fields`` returns for the header row.
    """
    # A leading "~" in the joined path is expanded to the user's home.
    train_path = os.path.expanduser(os.path.join(data_dir, train_filename))
    with io.open(train_path, encoding="utf8") as train_file:
        column_names = next(unicode_csv_reader(train_file))

    # "left_id" / "right_id" are passed as the columns to ignore. The
    # trailing positional options (True, "nltk", False) are forwarded as-is;
    # their meaning is defined by _make_fields.
    return _make_fields(
        column_names,
        id_attr,
        label_attr,
        ["left_id", "right_id"],
        True,
        "nltk",
        False,
    )
示例#2
0
def test_make_fields_1():
    """Check ``_make_fields`` against the header of sample_table_large.csv.

    Verifies the expected header row, that one entry is produced per header
    column, and how many entries share the same second element.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    path = os.path.join(test_dir_path, "test_datasets")
    a_dataset = "sample_table_large.csv"
    with io.open(os.path.expanduser(os.path.join(path, a_dataset)),
                 encoding="utf8") as f:
        header = next(unicode_csv_reader(f))
    assert header == [
        "_id",
        "ltable_id",
        "rtable_id",
        "label",
        "ltable_Song_Name",
        "ltable_Artist_Name",
        "ltable_Price",
        "ltable_Released",
        "rtable_Song_Name",
        "rtable_Artist_Name",
        "rtable_Price",
        "rtable_Released",
    ]
    id_attr = "_id"
    label_attr = "label"
    fields = _make_fields(header, id_attr, label_attr,
                          ["ltable_id", "rtable_id"], True, "nltk", True)

    # One entry per header column (12 columns above).
    assert len(fields) == 12

    # Tally how often each distinct second element occurs across entries.
    # NOTE(review): presumably the 1/1/2/8 split corresponds to the id
    # column, the label column, the two ignored columns and the eight
    # attribute columns -- confirm against _make_fields.
    counts = Counter(entry[1] for entry in fields)
    assert sorted(counts.values()) == [1, 1, 2, 8]
示例#3
0
    def setUp(self):
        """Prepare dataset locations, fields and column naming for the tests.

        Reads only the header row of the training CSV and builds
        ``self.fields`` from it via ``_make_fields``.
        """
        id_attr, label_attr = 'id', 'label'

        # Locations and file names of the fixture datasets.
        self.data_dir = os.path.join(test_dir_path, 'test_datasets')
        self.train = 'test_train.csv'
        self.validation = 'test_valid.csv'
        self.test = 'test_test.csv'
        self.cache_name = 'test_cacheddata.pth'

        train_path = os.path.expanduser(
            os.path.join(self.data_dir, self.train))
        with io.open(train_path, encoding="utf8") as train_file:
            header_row = next(unicode_csv_reader(train_file))

        # 'left_id' / 'right_id' are passed as the columns to ignore; the
        # trailing options (True, 'nltk', False) are forwarded unchanged and
        # their meaning is defined by _make_fields.
        self.fields = _make_fields(
            header_row,
            id_attr,
            label_attr,
            ['left_id', 'right_id'],
            True,
            'nltk',
            False,
        )

        # Naming scheme: id/label column names plus the left/right prefixes.
        self.column_naming = {
            'id': id_attr,
            'left': 'left_',
            'right': 'right_',
            'label': label_attr
        }
示例#4
0
 def test_make_fields_1(self):
     """Check ``_make_fields`` against the header of sample_table_large.csv.

     Verifies the expected header row, that one entry is produced per
     header column, and how many entries share the same second element.
     """
     # Local import keeps this block self-contained.
     from collections import Counter

     path = os.path.join(test_dir_path, 'test_datasets')
     a_dataset = 'sample_table_large.csv'
     with io.open(os.path.expanduser(os.path.join(path, a_dataset)),
                  encoding="utf8") as f:
         header = next(unicode_csv_reader(f))
     self.assertEqual(header, [
         '_id', 'ltable_id', 'rtable_id', 'label', 'ltable_Song_Name',
         'ltable_Artist_Name', 'ltable_Price', 'ltable_Released',
         'rtable_Song_Name', 'rtable_Artist_Name', 'rtable_Price',
         'rtable_Released'
     ])
     id_attr = '_id'
     label_attr = 'label'
     fields = _make_fields(header, id_attr, label_attr,
                           ['ltable_id', 'rtable_id'], True, 'nltk', True)

     # One entry per header column (12 columns above).
     self.assertEqual(len(fields), 12)

     # Tally how often each distinct second element occurs across entries.
     # NOTE(review): presumably the 1/1/2/8 split corresponds to the id
     # column, the label column, the two ignored columns and the eight
     # attribute columns -- confirm against _make_fields.
     counts = Counter(entry[1] for entry in fields)
     self.assertEqual(sorted(counts.values()), [1, 1, 2, 8])