def test_dump_concise(): one = 1 two = 2.1 three = 3.01 exact = 1.000000000000001 # loses the last decimal place almost = 1.0000000000000001 X = [[one, two, three, exact, almost], [1e9, 2e18, 3e27, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]] y = [one, two, three, exact, almost] f = BytesIO() dump_svmlight_file(X, y, f) f.seek(0) # make sure it's using the most concise format possible assert_equal(f.readline(), b("1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n")) assert_equal(f.readline(), b("2.1 0:1000000000 1:2e+18 2:3e+27\n")) assert_equal(f.readline(), b("3.01 \n")) assert_equal(f.readline(), b("1.000000000000001 \n")) assert_equal(f.readline(), b("1 \n")) f.seek(0) # make sure it's correct too :) X2, y2 = load_svmlight_file(f) assert_array_almost_equal(X, X2.toarray()) assert_array_equal(y, y2)
def test_load_zero_based_auto(): data1 = b("-1 1:1 2:2 3:3\n") data2 = b("-1 0:0 1:1\n") f1 = BytesIO(data1) X, y = load_svmlight_file(f1, zero_based="auto") assert_equal(X.shape, (1, 3)) f1 = BytesIO(data1) f2 = BytesIO(data2) X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto") assert_equal(X1.shape, (1, 4)) assert_equal(X2.shape, (1, 4))
def test_dump_multilabel(): X = [[1, 0, 3, 0, 5], [0, 0, 0, 0, 0], [0, 5, 0, 1, 0]] y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]] y_sparse = sp.csr_matrix(y_dense) for y in [y_dense, y_sparse]: f = BytesIO() dump_svmlight_file(X, y, f, multilabel=True) f.seek(0) # make sure it dumps multilabel correctly assert_equal(f.readline(), b("1 0:1 2:3 4:5\n")) assert_equal(f.readline(), b("0,2 \n")) assert_equal(f.readline(), b("0,1 1:5 3:1\n"))
def test_dump_comment(): X, y = load_svmlight_file(datafile) X = X.toarray() f = BytesIO() ascii_comment = "This is a comment\nspanning multiple lines." dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False) f.seek(0) X2, y2 = load_svmlight_file(f, zero_based=False) assert_array_almost_equal(X, X2.toarray()) assert_array_equal(y, y2) # XXX we have to update this to support Python 3.x utf8_comment = b("It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc") f = BytesIO() assert_raises(UnicodeDecodeError, dump_svmlight_file, X, y, f, comment=utf8_comment) unicode_comment = utf8_comment.decode("utf-8") f = BytesIO() dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False) f.seek(0) X2, y2 = load_svmlight_file(f, zero_based=False) assert_array_almost_equal(X, X2.toarray()) assert_array_equal(y, y2) f = BytesIO() assert_raises(ValueError, dump_svmlight_file, X, y, f, comment="I've got a \0.")
def test_default_load_files(test_category_dir_1, test_category_dir_2, load_files_root): res = load_files(load_files_root) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) assert_equal(res.data, [b("Hello World!\n")])
def test_load_with_long_qid(): # load svmfile with longint qid attribute data = b(""" 1 qid:0 0:1 1:2 2:3 0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985 0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985 3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985""") X, y, qid = load_svmlight_file(BytesIO(data), query_id=True) true_X = [[1, 2, 3], [1440446648, 72048431380967004, 236784985], [1440446648, 72048431380967004, 236784985], [1440446648, 72048431380967004, 236784985]] true_y = [1, 0, 0, 3] trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807] assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) assert_array_equal(qid, trueQID) f = BytesIO() dump_svmlight_file(X, y, f, query_id=qid, zero_based=True) f.seek(0) X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True) assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) assert_array_equal(qid, trueQID) f.seek(0) X, y = load_svmlight_file(f, query_id=False, zero_based=True) assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X)
def setup_load_files(): global TEST_CATEGORY_DIR1 global TEST_CATEGORY_DIR2 TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT) TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT) sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, delete=False) sample_file.write(b("Hello World!\n")) sample_file.close()
def test_category_dir_1(load_files_root): test_category_dir1 = tempfile.mkdtemp(dir=load_files_root) sample_file = tempfile.NamedTemporaryFile(dir=test_category_dir1, delete=False) sample_file.write(b("Hello World!\n")) sample_file.close() yield str(test_category_dir1) _remove_dir(test_category_dir1)
def test_default_load_files(): try: setup_load_files() res = load_files(LOAD_FILES_ROOT) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) assert_equal(res.data, [b("Hello World!\n")]) finally: teardown_load_files()
def _dump_svmlight_dense(X, y, f, one_based, comment, query_id): is_sp = int(hasattr(X, "tocsr")) if X.dtype.kind == 'i': value_pattern = u("%d:%d") else: value_pattern = u("%d:%.16g") if y.dtype.kind == 'i': line_pattern = u("%d") else: line_pattern = u("%.16g") if query_id is not None: line_pattern += u(" qid:%d") line_pattern += u(" %s\n") if comment: f.write(b("# Generated by dump_svmlight_file from scikit-learn %s\n" % __version__)) f.write(b("# Column indices are %s-based\n" % ["zero", "one"][one_based])) f.write(b("#\n")) f.writelines(b("# %s\n" % line) for line in comment.splitlines()) for i in range(X.shape[0]): if is_sp: #print 'is_sp' span = slice(X.indptr[i], X.indptr[i + 1]) row = zip(X.indices[span], X.data[span]) else: #nz = X[i] != 0 #row = zip(np.where(nz)[0], X[i, nz]) row = [(j,X[i][j]) for j in range(len(X[i]))] #print row s = " ".join(value_pattern % (j + one_based, x) for j, x in row) if query_id is not None: feat = (y[i], query_id[i], s) else: feat = (y[i], s) f.write((line_pattern % feat).encode('ascii'))
def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, color=False, resize=None): """Perform the actual data loading for the LFW pairs dataset This operation is meant to be cached by a joblib wrapper. """ # parse the index file to find the number of pairs to be able to allocate # the right amount of memory before starting to decode the jpeg files with open(index_file_path, 'rb') as index_file: split_lines = [ln.strip().split(b('\t')) for ln in index_file] pair_specs = [sl for sl in split_lines if len(sl) > 2] n_pairs = len(pair_specs) # interating over the metadata lines for each pair to find the filename to # decode and load in memory target = np.zeros(n_pairs, dtype=np.int) file_paths = list() for i, components in enumerate(pair_specs): if len(components) == 3: target[i] = 1 pair = ( (components[0], int(components[1]) - 1), (components[0], int(components[2]) - 1), ) elif len(components) == 4: target[i] = 0 pair = ( (components[0], int(components[1]) - 1), (components[2], int(components[3]) - 1), ) else: raise ValueError("invalid line %d: %r" % (i + 1, components)) for j, (name, idx) in enumerate(pair): try: person_folder = join(data_folder_path, name) except TypeError: person_folder = join(data_folder_path, str(name, 'UTF-8')) filenames = list(sorted(listdir(person_folder))) file_path = join(person_folder, filenames[idx]) file_paths.append(file_path) pairs = _load_imgs(file_paths, slice_, color, resize) shape = list(pairs.shape) n_faces = shape.pop(0) shape.insert(0, 2) shape.insert(0, n_faces // 2) pairs.shape = shape return pairs, target, np.array(['Different persons', 'Same person'])
def setup_module(): """Test fixture run once and common to all tests of this module""" if not pillow_installed: raise SkipTest("PIL not installed.") if not os.path.exists(LFW_HOME): os.makedirs(LFW_HOME) random_state = random.Random(42) np_rng = np.random.RandomState(42) # generate some random jpeg files for each person counts = {} for name in FAKE_NAMES: folder_name = os.path.join(LFW_HOME, 'lfw_funneled', name) if not os.path.exists(folder_name): os.makedirs(folder_name) n_faces = np_rng.randint(1, 5) counts[name] = n_faces for i in range(n_faces): file_path = os.path.join(folder_name, name + '_%04d.jpg' % i) uniface = np_rng.randint(0, 255, size=(250, 250, 3)) try: imsave(file_path, uniface) except ImportError: raise SkipTest("PIL not installed") # add some random file pollution to test robustness with open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb') as f: f.write(six.b('Text file to be ignored by the dataset loader.')) # generate some pairing metadata files using the same format as LFW with open(os.path.join(LFW_HOME, 'pairsDevTrain.txt'), 'wb') as f: f.write(six.b("10\n")) more_than_two = [ name for name, count in six.iteritems(counts) if count >= 2 ] for i in range(5): name = random_state.choice(more_than_two) first, second = random_state.sample(range(counts[name]), 2) f.write(six.b('%s\t%d\t%d\n' % (name, first, second))) for i in range(5): first_name, second_name = random_state.sample(FAKE_NAMES, 2) first_index = random_state.choice(np.arange(counts[first_name])) second_index = random_state.choice(np.arange(counts[second_name])) f.write( six.b('%s\t%d\t%s\t%d\n' % (first_name, first_index, second_name, second_index))) with open(os.path.join(LFW_HOME, 'pairsDevTest.txt'), 'wb') as f: f.write(six.b("Fake place holder that won't be tested")) with open(os.path.join(LFW_HOME, 'pairs.txt'), 'wb') as f: f.write(six.b("Fake place holder that won't be tested"))
def test_load_with_qid(): # load svmfile with qid attribute data = b(""" 3 qid:1 1:0.53 2:0.12 2 qid:1 1:0.13 2:0.1 7 qid:2 1:0.87 2:0.12""") X, y = load_svmlight_file(BytesIO(data), query_id=False) assert_array_equal(y, [3, 2, 7]) assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]]) res1 = load_svmlight_files([BytesIO(data)], query_id=True) res2 = load_svmlight_file(BytesIO(data), query_id=True) for X, y, qid in (res1, res2): assert_array_equal(y, [3, 2, 7]) assert_array_equal(qid, [1, 1, 2]) assert_array_equal(X.todense(), [[.53, .12], [.13, .1], [.87, .12]])
def setup_module(): """Test fixture run once and common to all tests of this module""" if imsave is None: raise SkipTest if not os.path.exists(LFW_HOME): os.makedirs(LFW_HOME) random_state = random.Random(42) np_rng = np.random.RandomState(42) # generate some random jpeg files for each person counts = {} for name in FAKE_NAMES: folder_name = os.path.join(LFW_HOME, 'lfw_funneled', name) if not os.path.exists(folder_name): os.makedirs(folder_name) n_faces = np_rng.randint(1, 5) counts[name] = n_faces for i in range(n_faces): file_path = os.path.join(folder_name, name + '_%04d.jpg' % i) uniface = np_rng.randint(0, 255, size=(250, 250, 3)) try: imsave(file_path, uniface) except ImportError: # PIL is not properly installed, skip those tests raise SkipTest # add some random file pollution to test robustness with open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb') as f: f.write(six.b('Text file to be ignored by the dataset loader.')) # generate some pairing metadata files using the same format as LFW with open(os.path.join(LFW_HOME, 'pairsDevTrain.txt'), 'wb') as f: f.write(six.b("10\n")) more_than_two = [name for name, count in six.iteritems(counts) if count >= 2] for i in range(5): name = random_state.choice(more_than_two) first, second = random_state.sample(range(counts[name]), 2) f.write(six.b('%s\t%d\t%d\n' % (name, first, second))) for i in range(5): first_name, second_name = random_state.sample(FAKE_NAMES, 2) first_index = random_state.choice(np.arange(counts[first_name])) second_index = random_state.choice(np.arange(counts[second_name])) f.write(six.b('%s\t%d\t%s\t%d\n' % (first_name, first_index, second_name, second_index))) with open(os.path.join(LFW_HOME, 'pairsDevTest.txt'), 'wb') as f: f.write(six.b("Fake place holder that won't be tested")) with open(os.path.join(LFW_HOME, 'pairs.txt'), 'wb') as f: f.write(six.b("Fake place holder that won't be tested"))
def test_default_load_files(): res = load_files(LOAD_FILES_ROOT) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) assert_equal(res.data, [b("Hello World!\n")])
def test_load_zero_based(): f = BytesIO(b("-1 4:1.\n1 0:1\n")) load_svmlight_file(f, zero_based=False)
def test_mmhash3_bytes(): assert_equal(murmurhash3_32(b('foo'), 0), -156908512) assert_equal(murmurhash3_32(b('foo'), 42), -1322301282) assert_equal(murmurhash3_32(b('foo'), 0, positive=True), 4138058784) assert_equal(murmurhash3_32(b('foo'), 42, positive=True), 2972666014)
def test_load_zero_based(): f = BytesIO(b("-1 4:1.\n1 0:1\n")) assert_raises(ValueError, load_svmlight_file, f, zero_based=False)