def fields(id_attr, label_attr, data_dir, train_filename):
    """Build the field list for the training CSV located in ``data_dir``.

    Only the first row (the header) of the file is read. That header is
    passed to ``_make_fields`` together with the id/label column names and
    the two columns that are always ignored here: ``left_id`` and
    ``right_id``.

    Args:
        id_attr: Name of the id column, forwarded to ``_make_fields``.
        label_attr: Name of the label column, forwarded to ``_make_fields``.
        data_dir: Directory containing the training file; a leading ``~``
            in the joined path is expanded.
        train_filename: File name of the training CSV inside ``data_dir``.

    Returns:
        Whatever ``_make_fields`` returns for the header of the file.
    """
    train_path = os.path.expanduser(os.path.join(data_dir, train_filename))
    with io.open(train_path, encoding="utf8") as handle:
        # The first row produced by the reader is the column header.
        columns = next(unicode_csv_reader(handle))

    skipped_columns = ["left_id", "right_id"]
    # NOTE(review): the trailing positional arguments (True, "nltk", False)
    # presumably select lower-casing, the tokenizer and length inclusion --
    # confirm against the signature of _make_fields.
    return _make_fields(
        columns, id_attr, label_attr, skipped_columns, True, "nltk", False
    )
def test_make_fields_1():
    """Check ``_make_fields`` against the header of ``sample_table_large.csv``.

    The test first pins the exact header of the sample file, then verifies
    that ``_make_fields`` yields one entry per column (12 in total) and that
    grouping the entries by their second element gives groups of sizes
    1, 1, 2 and 8.
    """
    csv_path = os.path.expanduser(
        os.path.join(test_dir_path, "test_datasets", "sample_table_large.csv")
    )
    with io.open(csv_path, encoding="utf8") as handle:
        columns = next(unicode_csv_reader(handle))

    expected_columns = [
        "_id",
        "ltable_id",
        "rtable_id",
        "label",
        "ltable_Song_Name",
        "ltable_Artist_Name",
        "ltable_Price",
        "ltable_Released",
        "rtable_Song_Name",
        "rtable_Artist_Name",
        "rtable_Price",
        "rtable_Released",
    ]
    assert columns == expected_columns

    made = _make_fields(
        columns, "_id", "label", ["ltable_id", "rtable_id"], True, "nltk", True
    )
    assert len(made) == 12

    # Tally how many entries share the same second element.
    occurrences = {}
    for entry in made:
        shared = entry[1]
        occurrences[shared] = occurrences.get(shared, 0) + 1
    assert sorted(occurrences.values()) == [1, 1, 2, 8]
def setUp(self):
    """Prepare dataset locations, fields and column naming for each test.

    Stores the test dataset directory, the train/validation/test file names
    and the cache file name on the instance, then reads the header of the
    training CSV and builds ``self.fields`` from it via ``_make_fields``
    (ignoring ``left_id`` and ``right_id``). Finally records the column
    naming mapping in ``self.column_naming``.
    """
    self.data_dir = os.path.join(test_dir_path, 'test_datasets')
    self.train = 'test_train.csv'
    self.validation = 'test_valid.csv'
    self.test = 'test_test.csv'
    self.cache_name = 'test_cacheddata.pth'

    train_path = os.path.expanduser(os.path.join(self.data_dir, self.train))
    with io.open(train_path, encoding="utf8") as handle:
        # Only the header row is needed to build the fields.
        columns = next(unicode_csv_reader(handle))

    id_column = 'id'
    label_column = 'label'
    # NOTE(review): the trailing positional arguments (True, 'nltk', False)
    # presumably select lower-casing, the tokenizer and length inclusion --
    # confirm against the signature of _make_fields.
    self.fields = _make_fields(
        columns, id_column, label_column, ['left_id', 'right_id'],
        True, 'nltk', False
    )
    self.column_naming = dict(
        id=id_column,
        left='left_',
        right='right_',
        label=label_column,
    )
def test_make_fields_1(self):
    """Check ``_make_fields`` against the header of ``sample_table_large.csv``.

    Pins the exact header of the sample file, then verifies that
    ``_make_fields`` yields 12 entries and that grouping those entries by
    their second element gives groups of sizes 1, 1, 2 and 8.
    """
    csv_path = os.path.expanduser(
        os.path.join(test_dir_path, 'test_datasets', 'sample_table_large.csv')
    )
    with io.open(csv_path, encoding="utf8") as handle:
        columns = next(unicode_csv_reader(handle))

    expected_columns = [
        '_id',
        'ltable_id',
        'rtable_id',
        'label',
        'ltable_Song_Name',
        'ltable_Artist_Name',
        'ltable_Price',
        'ltable_Released',
        'rtable_Song_Name',
        'rtable_Artist_Name',
        'rtable_Price',
        'rtable_Released',
    ]
    self.assertEqual(columns, expected_columns)

    made = _make_fields(
        columns, '_id', 'label', ['ltable_id', 'rtable_id'], True, 'nltk', True
    )
    self.assertEqual(len(made), 12)

    # Tally how many entries share the same second element.
    occurrences = {}
    for entry in made:
        shared = entry[1]
        occurrences[shared] = occurrences.get(shared, 0) + 1
    self.assertEqual(sorted(occurrences.values()), [1, 1, 2, 8])