def test_assymetric_U(self):
    """Aligning with non-symmetric matching matrices must raise ValueError."""
    n_points = 100
    X, t_x = datasets.make_s_curve(n_points, random_state=0)
    Y, t_y = datasets.make_s_curve(n_points, random_state=1)
    # Different kernel widths guarantee U_2 != U_1.T, i.e. an asymmetric pair.
    U_1 = utils.gaussian_similarity_kernel(
        t_x.reshape(n_points, 1), t_y.reshape(n_points, 1), 1.0)
    U_2 = utils.gaussian_similarity_kernel(
        t_y.reshape(n_points, 1), t_x.reshape(n_points, 1), 0.5)
    matchings = [[None, U_1], [U_2, None]]
    assert_raises(ValueError, laplacian_manifold_align,
                  [X, Y], matchings, 5, 0.1, 0.1)
def test_make_s_curve():
    """Noise-free S-curve points must satisfy the closed-form parameterization."""
    points, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)
    assert_equal(points.shape, (5, 3), "X shape mismatch")
    assert_equal(t.shape, (5,), "t shape mismatch")
    expected_x = np.sin(t)
    expected_z = np.sign(t) * (np.cos(t) - 1)
    assert_array_equal(points[:, 0], expected_x)
    assert_array_equal(points[:, 2], expected_z)
def create_true_data(type_of_data, number_of_modes, std, size, vocabulary_size):
    """Generate a 2-D toy dataset with coordinates in [0, vocabulary_size].

    Parameters
    ----------
    type_of_data : str
        One of "mixture_of_gaussians", "blobs", "moons", "circles",
        "swiss_roll" or "s_curve".
    number_of_modes : int
        Number of components (used by "mixture_of_gaussians" and "blobs").
    std : float
        Cluster/noise scale (used by "blobs" and "swiss_roll").
    size : int
        Samples per mode ("mixture_of_gaussians") or total samples otherwise.
    vocabulary_size : int
        Coordinates are scaled and clipped into [0, vocabulary_size].

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 2); integer-valued for all but the Gaussian mixture.

    Raises
    ------
    ValueError
        If `type_of_data` is not a supported name (previously the function
        silently fell through, failing only at `return x`).
    """
    known = {"mixture_of_gaussians", "blobs", "moons", "circles",
             "swiss_roll", "s_curve"}
    if type_of_data not in known:
        raise ValueError("unknown type_of_data: %r" % (type_of_data,))
    list_of_x_values, list_of_y_values = list(), list()
    if (type_of_data == "mixture_of_gaussians"):
        # NOTE(review): `std` is ignored in this branch; the scale is
        # hard-coded to 500 -- confirm whether that is intentional.
        for i in range(number_of_modes):
            list_of_x_values.append(np.clip(np.random.normal(loc=np.random.randint(vocabulary_size - 1), scale=500, size=size), 0, vocabulary_size))
            list_of_y_values.append(np.clip(np.random.normal(loc=np.random.randint(vocabulary_size - 1), scale=500, size=size), 0, vocabulary_size))
        x = np.column_stack((np.append([], list_of_x_values), np.append([], list_of_y_values)))
    # Random rotation angle shared by the moons/swiss_roll/s_curve branches.
    cos_theta = np.random.uniform()
    sin_theta = math.sqrt(1 - cos_theta * cos_theta)
    if (type_of_data == "blobs"):
        x = np.clip(((vocabulary_size / 20) * make_blobs(n_samples=size, centers=number_of_modes, cluster_std=std)[0] + (vocabulary_size / 2)), [0, 0], [vocabulary_size, vocabulary_size]).astype(int)
    if (type_of_data == "moons"):
        x = ((np.dot(make_moons(n_samples=size)[0] * (1 / 2), np.array([[cos_theta, sin_theta], [-sin_theta, cos_theta]]))) * (vocabulary_size / 2) + (vocabulary_size / 2)).astype(int)
    if (type_of_data == "circles"):
        x = ((make_circles(n_samples=size)[0] * (vocabulary_size / 2)) + (vocabulary_size / 2)).astype(int)
    if (type_of_data == "swiss_roll"):
        # Project the 3-D roll onto (x, z), rotate, then rescale into range.
        x = make_swiss_roll(n_samples=size, random_state=2, noise=std)[0]
        x = np.column_stack((x[:, 0], x[:, 2]))
        x = np.dot((1 / 25) * x, np.array([[cos_theta, -sin_theta], [sin_theta, cos_theta]]))
        x = (x * (vocabulary_size / 2) + (vocabulary_size / 2)).astype(int)
    if (type_of_data == "s_curve"):
        x = make_s_curve(n_samples=size)[0] / 2
        x = np.column_stack((x[:, 0], x[:, 2]))
        x = ((np.dot(x, np.array([[cos_theta, -sin_theta], [sin_theta, cos_theta]]))) * (vocabulary_size / 2) + (vocabulary_size / 2)).astype(int)
    return x
def s_curves(request):
    """Fixture: attach a continuous-target S-curve Dataset to the test class."""
    features, target = make_s_curve(1000, random_state=888)
    # Continuous data, exposed as a class attribute for the test suite.
    request.cls.s_curves = Dataset(features, target)
def test_shape(self):
    """The aligned embedding L must have shape (total points, d)."""
    n_points = 100
    X, t_x = datasets.make_s_curve(n_points, random_state=0)
    Y, t_y = datasets.make_s_curve(n_points, random_state=1)
    U_1 = utils.gaussian_similarity_kernel(
        t_x.reshape(n_points, 1), t_y.reshape(n_points, 1), 1.0)
    U_2 = utils.gaussian_similarity_kernel(
        t_y.reshape(n_points, 1), t_x.reshape(n_points, 1), 1.0)
    matchings = [[None, U_1], [U_2, None]]
    for dim in (2, 3, 4):
        L = laplacian_manifold_align([X, Y], matchings, 5, 0.1, 0.1, d=dim)
        assert_true(L.shape == (n_points * 2, dim))
def __init__(self, train=True, n_samples=6000, noise=0.05, test_fraction=0.1, seed=42):
    """S-curve dataset wrapper.

    Args:
        train: whether to expose the train split (forwarded to the base class).
        n_samples: number of points sampled from the manifold.
        noise: standard deviation of the Gaussian noise added to the points.
        test_fraction: fraction of samples held out for testing.
        seed: seed for the RandomState shared with the base class.
    """
    _rnd = np.random.RandomState(seed)
    # Pass noise/random_state by keyword: scikit-learn deprecated (and
    # later removed) positional use of these arguments in make_s_curve.
    data, pos = make_s_curve(n_samples, noise=noise, random_state=_rnd)
    data = data.astype(np.float32)
    pos = pos.astype(np.float32)
    super().__init__(data, pos, train, test_fraction, _rnd)
def load_s_curve_hole(n_points=2000, deviation=0.1):
    """ Load a s curve dataset but with a hole in the middle

    NOTE(review): despite the name and docstring, no hole is carved out
    here -- this returns the full noisy S-curve straight from
    scikit-learn. Confirm whether the hole-punching step was lost or is
    applied by the caller.

    :param n_points: number of points to sample
    :param deviation: noise level forwarded to make_s_curve
    :return: (points, color) exactly as returned by make_s_curve
    """
    x, color = make_s_curve(n_samples=n_points, noise=deviation)
    return x, color
def test_make_s_curve():
    """The noiseless S-curve matches its analytic parameterization."""
    points, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)
    assert points.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5, ), "t shape mismatch"
    expected_x = np.sin(t)
    expected_z = np.sign(t) * (np.cos(t) - 1)
    assert_array_almost_equal(points[:, 0], expected_x)
    assert_array_almost_equal(points[:, 2], expected_z)
def test_diff_size_graphs(self):
    """Alignment must also work when the two graphs differ in size."""
    n_x, n_y = 100, 200
    X, t_x = datasets.make_s_curve(n_x, random_state=0)
    Y, t_y = datasets.make_s_curve(n_y, random_state=1)
    U_1 = utils.gaussian_similarity_kernel(
        t_x.reshape(n_x, 1), t_y.reshape(n_y, 1), 1.0)
    U_2 = utils.gaussian_similarity_kernel(
        t_y.reshape(n_y, 1), t_x.reshape(n_x, 1), 1.0)
    matchings = [[None, U_1], [U_2, None]]
    for dim in (2, 3, 4):
        L = laplacian_manifold_align([X, Y], matchings, 5, 0.1, 0.1, d=dim)
        assert_true(L.shape == (n_x + n_y, dim))
def test_curve():
    """Run ptSNE on the S-curve across several perplexities."""
    X, colors = datasets.make_s_curve(n_samples=128 * 2, noise=.05)
    run_ptsne_modes(X)
    for perplexity in (5, 30, 50, 100):
        name = "S_Curve_{}".format(perplexity)
        run_comparison(name, X, color=colors, perplexity=perplexity)
def generate_scurve(samples, noise, num_classes):
    """Sample an S-curve and discretize its parameter t into class labels.

    Args:
        samples: number of points to draw.
        noise: Gaussian noise level passed to make_s_curve.
        num_classes: number of equal-width bins along t.

    Returns:
        (X, y): float32 points of shape (samples, 3) and int32 labels in
        [0, num_classes).
    """
    X, t = make_s_curve(samples, noise=noise)
    edges = np.linspace(t.min(), t.max(), num_classes + 1)
    y = np.empty(samples)
    for i in range(edges.size - 1):
        # Bug fix: the second factor used to repeat the lower-bound test
        # ((spaces[i] <= t) * (spaces[i] <= t)); the bins only came out
        # right because later iterations overwrote earlier labels. Use a
        # well-formed [lower, upper] bin instead.
        idx = (edges[i] <= t) & (t <= edges[i + 1])
        y[idx] = i
    return X.astype(np.float32), y.astype(np.int32)
def test_manifold_algorithm_transform_fit(self, algorithm):
    """ Test manifold fit with algorithms having transform implemented """
    data, target = make_s_curve(1000, random_state=94)
    with pytest.warns(YellowbrickWarning):
        visualizer = Manifold(manifold=algorithm, target="auto")
        assert visualizer.fit(data, target) is visualizer, "fit did not return self"
def main(args):
    """Serverless benchmark entry point: PCA on a sparse S-curve sample.

    Returns a dict with a completion token and the start time in ms.

    NOTE(review): PCA with n_components=150 cannot succeed on 3-feature
    S-curve data (n_components must not exceed n_features) -- confirm
    intent before relying on this.
    NOTE(review): sklearn's PCA generally rejects scipy sparse input
    (TruncatedSVD is the usual sparse alternative) -- verify this runs
    against the pinned sklearn version.
    """
    n_samples = args.get("n_samples", 2000)
    startTime = time.time()
    rng = np.random.RandomState(0)
    X, y = make_s_curve(n_samples=n_samples, random_state=rng)
    X = scipy.sparse.csr_matrix(X)
    X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
    pca = PCA(n_components=150, svd_solver='randomized', whiten=True).fit(X_train)
    return {'token': 'pca finished', 'startTime': int(round(startTime * 1000))}
def make_toy_Story(n_samples_per_class = 100, ood=False):
    """Build the 2-D 'Story' toy set: an S-curve cut into three height
    bands (labels 0-2) plus two isotropic Gaussian blobs (labels 3-4),
    shuffled together. `ood` is accepted for signature compatibility."""
    mu1 = [.0, 1]
    mu2 = [.0, -1]
    cov = [[.05, 0], [0, .05]]
    curve, color = make_s_curve(n_samples_per_class * 3, random_state=0)
    z = curve[:, 2]
    # Band labels along the curve's height: 0 below -0.5, 1 in [-0.5, 1), 2 above.
    s_labels = 0 * (z < -.5) + 1 * (-.5 <= z) * (z < 1) + 2 * (z >= 1)
    blob_hi = np.random.multivariate_normal(mu1, cov, n_samples_per_class)
    blob_lo = np.random.multivariate_normal(mu2, cov, n_samples_per_class)
    X = np.concatenate([curve[:, [0, 2]], blob_hi, blob_lo])
    Y = np.concatenate([s_labels,
                        [3] * n_samples_per_class,
                        [4] * n_samples_per_class])
    perm = np.random.permutation(len(X))
    return X[perm], Y[perm]
def make_s_curve(n_samples, seed):
    """Return S-curve points ordered along t, colored purple/blue by half.

    The first half of the t-sorted points is labeled 'purple', the rest
    'blue'.
    """
    points, t = datasets.make_s_curve(n_samples=n_samples, random_state=seed)
    order = t.argsort()
    t.sort()
    points = points[order]
    colors = ['purple' if i < n_samples / 2 else 'blue'
              for i in range(n_samples)]
    return (points, colors)
def main(args):
    """Serverless benchmark entry point: KMeans on a sparse S-curve sample.

    Returns a dict with the completeness score as token and the start
    time in ms.

    NOTE(review): the S-curve target y is continuous; completeness_score
    treats every distinct float as its own label -- confirm this scoring
    is intended.
    NOTE(review): the predicted labels are passed as labels_true and the
    ground truth as labels_pred; completeness_score is not symmetric, so
    this effectively computes homogeneity of the predictions -- verify.
    """
    n_samples = args.get("n_samples", 4000)
    startTime = time.time()
    rng = np.random.RandomState(0)
    X, y = make_s_curve(n_samples=n_samples, random_state=rng)
    X = scipy.sparse.csr_matrix(X)
    X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
    # KMeans is left at its default n_clusters (8); only the algorithm is set.
    kmeans = KMeans(algorithm='elkan').fit(X_train)
    token = completeness_score(kmeans.predict(X_test), y_test)
    print(token)
    return {'token': token, 'startTime': int(round(startTime * 1000))}
def test_serialize():
    """A PTSNE model must produce identical embeddings after save/load."""
    data, _ = datasets.make_s_curve(n_samples=64, noise=.05)
    original = PTSNE(data, n_iter=10, batch_size=64)
    buffer = io.BytesIO()
    original.save(buffer)
    buffer.seek(0)
    restored = PTSNE.load(buffer)
    embedding_before = original.transform(data)
    embedding_after = restored.transform(data)
    assert np.allclose(embedding_before, embedding_after)
def make_toy_Story_with_ood_class(n_samples_per_class = 100):
    """Build the 2-D 'Story' toy set (S-curve bands 0-2, Gaussian blobs
    3-4) plus a surrounding circle of out-of-distribution points with
    label 5, all shuffled together."""
    mu1 = [.0, 1]
    mu2 = [.0, -1]
    cov = [[.05, 0], [0, .05]]
    curve, color = make_s_curve(n_samples_per_class * 3, random_state=0)
    z = curve[:, 2]
    # Band labels along the curve's height: 0 below -0.5, 1 in [-0.5, 1), 2 above.
    s_labels = 0 * (z < -.5) + 1 * (-.5 <= z) * (z < 1) + 2 * (z >= 1)
    blob_hi = np.random.multivariate_normal(mu1, cov, n_samples_per_class)
    blob_lo = np.random.multivariate_normal(mu2, cov, n_samples_per_class)
    n_ood = int(n_samples_per_class * 5)
    # OOD points live on a circle of radius 2.2 around the origin.
    angles = np.random.uniform(0, 2 * np.pi, [n_ood, 1])
    ood_s = np.concatenate([2.2 * np.cos(angles), 2.2 * np.sin(angles)], axis=1)
    X = np.concatenate([curve[:, [0, 2]], blob_hi, blob_lo, ood_s])
    Y = np.concatenate([s_labels,
                        [3] * n_samples_per_class,
                        [4] * n_samples_per_class,
                        [5] * n_ood])
    perm = np.random.permutation(len(X))
    return X[perm], Y[perm]
def test():
    """Compare several manifold-learning methods on a 1000-point S-curve.

    Fits four LLE variants, Isomap, MDS, spectral embedding and t-SNE,
    timing each, and shows the 2-D embeddings next to the 3-D input.
    Display-only; returns nothing.

    NOTE(review): Isomap/MDS/LLE are constructed with positional
    n_neighbors/n_components, which newer scikit-learn releases
    deprecate -- confirm against the pinned version.
    """
    n_points = 1000
    X, color = datasets.make_s_curve(n_points, random_state=0)
    n_neighbors = 10
    n_components = 2
    # Create figure
    fig = plt.figure(figsize=(15, 8))
    fig.suptitle("Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14)
    # Add 3d scatter plot
    ax = fig.add_subplot(251, projection='3d')
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
    ax.view_init(4, -72)
    # Set-up manifold methods: all LLE variants share neighbors/components.
    LLE = partial(manifold.LocallyLinearEmbedding, n_neighbors, n_components, eigen_solver='auto')
    methods = OrderedDict()
    methods['LLE'] = LLE(method='standard')
    methods['LTSA'] = LLE(method='ltsa')
    methods['Hessian LLE'] = LLE(method='hessian')
    methods['Modified LLE'] = LLE(method='modified')
    methods['Isomap'] = manifold.Isomap(n_neighbors, n_components)
    methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1)
    methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors)
    methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
    # Plot results
    for i, (label, method) in enumerate(methods.items()):
        t0 = time()
        Y = method.fit_transform(X)
        t1 = time()
        print("%s: %.2g sec" % (label, t1 - t0))
        # Fill grid slots 2-5, then 7-10 (skipping slot 6 under the 3-D plot).
        ax = fig.add_subplot(2, 5, 2 + i + (i > 3))
        ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
        ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')
    plt.show()
def _generate(self, random_state):
    """Produce an (x, y) S-curve from the request payload, optionally
    discretizing the continuous target into `classes` bins."""
    payload = api.payload
    x, y = make_s_curve(random_state=random_state,
                        n_samples=payload['rows'],
                        noise=payload['noise'])
    if 'classes' in payload:
        classes = payload['classes']
        if classes is not None and classes > 0:
            # Replace the continuous target with equal-width bin codes.
            y = pd.cut(y, classes).codes
    return x, y
def gen_s_curve(rng, emissions):
    """Generate synthetic data from datasets generating process.

    Draws a 2-D latent S-curve, GP-distributed latent maps F over it, and
    then emits observations Y under the requested noise model.

    Parameters
    ----------
    rng : numpy RandomState used for (nearly) all sampling below.
    emissions : one of 'bernoulli', 'gaussian', 'multinomial',
        'negbinom' or 'poisson'; any other value trips the assert.

    Returns
    -------
    Dataset named 's-curve' holding (Y, X, F, K, R-or-None, t).
    """
    N = 500
    J = 100
    D = 2
    # Generate latent manifold.
    # -------------------------
    X, t = make_s_curve(N, random_state=rng)
    # Drop the flat middle axis and scale each latent dimension to unit std.
    X = np.delete(X, obj=1, axis=1)
    X = X / np.std(X, axis=0)
    # Order points along the curve parameter t.
    inds = t.argsort()
    X = X[inds]
    t = t[inds]
    # Generate kernel `K` and latent GP-distributed maps `F`.
    # -------------------------------------------------------
    K = kern.RBF(input_dim=D, lengthscale=1).K(X)
    F = rng.multivariate_normal(np.zeros(N), K, size=J).T
    # Generate emissions using `F` and/or `K`.
    # ----------------------------------------
    if emissions == 'bernoulli':
        P = logistic(F)
        Y = rng.binomial(1, P).astype(np.double)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    if emissions == 'gaussian':
        # NOTE(review): this branch uses the global np.random rather than
        # `rng` -- confirm whether that is intentional.
        Y = F + np.random.normal(0, scale=0.5, size=F.shape)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'multinomial':
        C = 100
        # Softmax of F across the J maps gives per-sample probabilities.
        pi = np.exp(F - logsumexp(F, axis=1)[:, None])
        Y = np.zeros(pi.shape)
        for n in range(N):
            Y[n] = rng.multinomial(C, pi[n])
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'negbinom':
        P = logistic(F)
        # Per-column dispersion parameters 1..J are also returned.
        R = np.arange(1, J + 1, dtype=float)
        Y = rng.negative_binomial(R, 1 - P)
        return Dataset('s-curve', False, Y, X, F, K, R, t)
    else:
        assert (emissions == 'poisson')
        theta = np.exp(F)
        Y = rng.poisson(theta)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
def __init__(self, n_samples=SAMPLE, split='none', split_ratio=FIT_DEFAULT, random_state=SEED, data_path=DEFAULT_PATH):
    """Init.

    Args:
        n_samples(int, optional): Number of points to sample from the manifold.
        split(str, optional): Name of split. See BaseDataset.
        split_ratio(float, optional): Ratio of train split. See BaseDataset.
        random_state(int, optional): Random seed. See BaseDataset.
        data_path(str, optional): Unused. Only to share same signature with other datasets.
    """
    x, y = datasets.make_s_curve(n_samples=n_samples, random_state=random_state)
    super().__init__(x, y, split, split_ratio, random_state)
    # The continuous curve parameter t doubles as the ground-truth latent
    # coordinate; targets is a tensor here, hence the .numpy() conversion.
    self.latents = self.targets.numpy()
def load_dataset(dataset, n_samples, random_state=1, n_features=3):
    """Load one of the named 3-D toy datasets.

    Parameters
    ----------
    dataset : str
        One of 's_curve', 'swiss_roll', 'broken_swiss_roll', 'sphere',
        '3_circles', 'peaks' or 'blobs'.
    n_samples : int
        Number of points to generate.
    random_state : int, optional
        Seed forwarded to the underlying generator.
    n_features : int, optional
        Dimensionality; only used by 'blobs'.

    Returns
    -------
    tuple
        (X, y) as produced by the underlying generator.

    Raises
    ------
    ValueError
        If `dataset` is not a known name. (Previously this printed a
        message and returned None, deferring the failure to the caller.)
    """
    if dataset == 's_curve':
        return make_s_curve(n_samples, random_state=random_state)
    elif dataset == 'swiss_roll':
        return make_swiss_roll(n_samples, random_state=random_state)
    elif dataset == 'broken_swiss_roll':
        return make_broken_swiss_roll(n_samples, random_state=random_state)
    elif dataset == 'sphere':
        return make_sphere(n_samples, random_state=random_state)
    elif dataset == '3_circles':
        return make_3_circles(n_samples, random_state=random_state)
    elif dataset == 'peaks':
        return make_peaks(n_samples, random_state=random_state)
    elif dataset == 'blobs':
        return make_blobs(n_samples, n_features=n_features, centers=3, random_state=random_state)
    else:
        raise ValueError("unknown dataset: %r" % (dataset,))
def test_isomap_fit_precomputed_radius_graph():
    # Fitting on a precomputed radius-neighbors distance graph must yield
    # the same embedding as fitting directly on the points.
    X, y = datasets.make_s_curve(200, random_state=0)
    radius = 10
    graph = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
    precomputed_model = manifold.Isomap(n_neighbors=None, radius=radius,
                                        metric="precomputed")
    precomputed_model.fit(graph)
    precomputed_result = precomputed_model.embedding_
    direct_model = manifold.Isomap(n_neighbors=None, radius=radius,
                                   metric="minkowski")
    result = direct_model.fit_transform(X)
    assert_allclose(precomputed_result, result)
def genPoints(n_points=1000, func_name='swiss-roll'):
    """Sample 3-D points (and scalar colors) from a named toy manifold."""
    if func_name == 'swiss-roll':
        return datasets.make_swiss_roll(n_points, random_state=0)
    if func_name == 's-curve':
        return datasets.make_s_curve(n_points, random_state=0)
    if func_name == 'severed-sphere':
        rng = check_random_state(0)
        p = rng.rand(n_points) * (2 * np.pi - 0.55)
        t = rng.rand(n_points) * np.pi
        # Keep only points away from both poles (cut pi/8 caps top and bottom).
        keep = ((t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8))))
        colors = p[keep]
        points = np.c_[np.sin(t[keep]) * np.cos(p[keep]),
                       np.sin(t[keep]) * np.sin(p[keep]),
                       np.cos(t[keep])]
        return points, colors
    raise ValueError('Unsupported function [%s]' % func_name)
def test_transform():
    """Re-embedding noisy copies should move points only on the order of
    the injected noise."""
    n_samples = 200
    n_components = 10
    noise_scale = 0.01
    # Reference Isomap embedding of the clean S-curve.
    X, y = datasets.make_s_curve(n_samples, random_state=0)
    iso = manifold.Isomap(n_components=n_components, n_neighbors=2)
    X_iso = iso.fit_transform(X)
    # Perturb the inputs and re-embed through the out-of-sample transform.
    rng = np.random.RandomState(0)
    perturbation = noise_scale * rng.randn(*X.shape)
    X_iso2 = iso.transform(X + perturbation)
    # RMS re-embedding error should stay comparable to the noise level.
    rms_error = np.sqrt(np.mean((X_iso - X_iso2) ** 2))
    assert rms_error < 2 * noise_scale
def test_parsimonious():
    from UQpy.utilities.kernels.GaussianKernel import GaussianKernel
    from UQpy.dimension_reduction.diffusion_maps.DiffusionMaps import DiffusionMaps
    from sklearn.datasets import make_s_curve
    # Diffusion maps on a noiseless 4000-point S-curve: the parsimonious
    # selection should pick eigenvectors 1 and 5 as the two coordinates.
    n = 4000
    X, X_color = make_s_curve(n, random_state=3, noise=0)
    dmaps_object = DiffusionMaps(data=X,
                                 alpha=1.0,
                                 n_eigenvectors=9,
                                 is_sparse=True,
                                 n_neighbors=100,
                                 kernel=GaussianKernel())
    dmaps_object.parsimonious(dim=2)
    assert dmaps_object.parsimonious_indices[0] == 1
    assert dmaps_object.parsimonious_indices[1] == 5
def makeSCurve():
    """Project a noisy S-curve to 2-D, stretch it, and sprinkle uniform
    background noise over its bounding box."""
    n_points = 1000
    noise = 0.2
    X, color = datasets.make_s_curve(n_points, noise=noise, random_state=0)
    # Keep the (x, z) plane and stretch in all directions.
    Y = np.column_stack((X[:, 0], X[:, 2])) * 2
    # Background: one extra point per ten curve points, uniform over the
    # curve's bounding box.
    n_bg = n_points // 10
    lo = Y.min(axis=0)
    hi = Y.max(axis=0)
    Ybg = np.column_stack((
        np.random.uniform(low=lo[0], high=hi[0], size=n_bg),
        np.random.uniform(low=lo[1], high=hi[1], size=n_bg),
    ))
    return np.concatenate((Y, Ybg))
# NOTE(review): this `append` closes an elbow-method loop whose opening
# `for`/KMeans lines fall before this chunk -- not visible here.
wcss.append(kmeans.inertia_)
# Elbow plot: within-cluster sum of squares for k = 1..19.
plt.scatter(range(1, 20), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
plt.clf()

"""## Get the dataset"""

# Build five toy datasets of n points each, plus an anisotropic blob set.
n = 1000
from sklearn.datasets import make_moons, make_blobs, make_circles, make_s_curve
X_moons, y_moons = make_moons(n_samples = n, noise=0.1)
X_blobs, y_blobs = make_blobs(n_samples = n, n_features = 2)
X_circles, y_circles = make_circles(n_samples=n, noise=0.1, factor = 0.5)
# NOTE(review): make_s_curve returns 3-D points while the other sets are
# 2-D -- confirm downstream code handles the extra column.
X_scurve, y_scurve = make_s_curve(n_samples=n, noise = 0.1)
X_random = np.random.random([n, 2])
# Shear the blobs with a fixed linear map to get anisotropic clusters.
transformation = [[0.80834549, -0.83667341], [-0.20887718, 0.85253229]]
X_aniso = np.dot(X_blobs, transformation)
# Visualize and cluster the moons and blobs sets.
plot_dataset(X_moons)
visual_elbow(X_moons)
visual(10, X_moons)
plot_dataset(X_blobs)
visual_elbow(X_blobs)
visual(3, X_blobs)
# NOTE(review): these are the trailing arguments of a parser.add_argument(
# call whose opening line falls before this chunk.
    '--random_state',
    type=int,
    help=
    ' RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.'
)
import scrape as sc
# Attach the shared CLI option groups and parse.
sc.all_options(parser)
sc.output_options(parser)
from scrape import write_dict
args = sc.parse_args(parser)
# Fix the global NumPy seed so generation is repeatable regardless of flags.
np.random.seed(0)
X, y = make_s_curve(n_samples=args.n_samples, noise=args.noise, random_state=args.random_state)
# Persist the generated dataset in svmlight format and record its path.
datasets.dump_svmlight_file(X, y, args.output_file, zero_based=args.zero_based, query_id=args.query_id, multilabel=args.multilabel, comment=args.comment)
write_dict({'feature_file': args.output_file})
# the target here is just the digit represented by the data print(digits.target) # we've got two versions of the data array, data and images print(digits.data.shape) print(digits.images.shape) # we see that the two versions differ only in shape print(digits.data.__array_interface__["data"]) print(digits.images.__array_interface__["data"]) # we can visualise this. fig = plt_2.figure(figsize=(6, 6)) # figure size in inches fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) # plot the digits, each is 8x8 pixels for i in range(64): ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[]) ax.imshow(digits.images[i], cmap=plt_2.cm.binary, interpolation="nearest") # label the image with the target value ax.text(0, 7, str(digits.target[i])) # each feature is a real-valued quantity indicating the darkness of a particular # pixel in an 8x8 image print("Non linear dataset - The S-Curve: ") data, colors = make_s_curve(n_samples=1000) print(data.shape) print(colors.shape) # let's visualise this ax = plt_2.axes(projection="3d") ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=colors) ax.view_init(10, -60)
# Project the digits onto their first two principal components.
pca = decomposition.PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
plot_embedding(X_pca, "Principal Components projection of the digits")
# Show each principal component as an 8x8 image.
plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray")
plt.axis('off')
plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray")
plt.axis('off')
plt.show()

""" MANIFOLD LEARNING PCA has one weakness which is it cannot detect non-linear features. Then the manifold learning algorithms have been developed to bypass this deficiency. In manifold learning, we use a canonical dataset called the S-curve. """

from sklearn.datasets import make_s_curve
X,y=make_s_curve(n_samples=1000)
from mpl_toolkits.mplot3d import Axes3D
# 3-D scatter of the S-curve, colored by the curve parameter.
ax=plt.axes(projection='3d')
ax.scatter3D(X[:,0],X[:,1],X[:,2],c=y)
ax.view_init(10,-60)
# this is a 2D dataset embedded in 3D, but it is embedded in such a way that
#PCA can't discover the underlying data orientation.
from sklearn import decomposition
X_pca=decomposition.PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca[:,0],X_pca[:,1],c=y)
#Manifold learning algorithms, however, available in the sklearn.manifold
#submodule, are able to recover the underlying 2-dimensional manifold:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=15, n_components=2)
X_iso = iso.fit_transform(X)
plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y)
def make_sklearn_dataset(dataset_name, n_samples):
    """Create one of several named 2-D toy classification datasets.

    Parameters
    ----------
    dataset_name : str
        One of 'circles_distant', 'moons', 'blobs', 'circles_near',
        's_curve' or 'swiss_roll'.
    n_samples : int
        Total number of samples (split across the two halves for the
        's_curve' and 'swiss_roll' variants).

    Returns
    -------
    tuple
        (X, y) features and integer labels.

    Raises
    ------
    ValueError
        If `dataset_name` is unknown (previously this fell through to
        `return dataset` and raised UnboundLocalError).
    """
    # create dataset
    if 'circles_distant' == dataset_name:  # labels=3, seed=1, n-samples=1000, max-depth=4 OR labels=4, seed=1, n-samples=1000, max-depth=4
        dataset = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    elif 'moons' == dataset_name:  # labels=2, seed=13, n-samples=500, max-depth=4 OR labels=1, seed=27, n-samples=500, max-depth=4
        dataset = datasets.make_moons(n_samples=n_samples, noise=.05)
    elif 'blobs' == dataset_name:  # labels=1, seed=0, n-samples=100, max-depth=3
        dataset = datasets.make_blobs(n_samples=n_samples, random_state=8)
    elif 'circles_near' == dataset_name:  # labels = 20, seed=0, n-samples=2000, max-depth=5
        dataset = datasets.make_circles(n_samples=n_samples, noise=.05)
    elif 's_curve' == dataset_name:  # labels=10, seed=35, n-samples=2500, max-depth=7
        # Two 2-D projections of the S-curve, the second offset by (.5, .5),
        # labelled 0 and 1 respectively.
        scurve1 = datasets.make_s_curve(n_samples=n_samples // 2, noise=.05)
        scurve1 = np.vstack((scurve1[0][:, 0], scurve1[0][:, 2])).T
        scurve2 = datasets.make_s_curve(n_samples=n_samples // 2, noise=.05)
        scurve2 = np.vstack(
            (scurve2[0][:, 0], scurve2[0][:, 2])).T + [.5, .5]  # offset
        dataset = np.concatenate((scurve1, scurve2), 0), \
            np.concatenate((np.asarray([0] * scurve1.shape[0]),
                            np.asarray([1] * scurve2.shape[0])), 0)
    elif 'swiss_roll' == dataset_name:  # labels = 10, seed = 35, n-samples=2500, max-depth=5
        # Two 2-D projections of the swiss roll, the second shrunk by 0.75,
        # labelled 0 and 1 respectively.
        sroll1 = datasets.make_swiss_roll(n_samples=n_samples // 2, noise=.05)
        sroll1 = np.vstack((sroll1[0][:, 0], sroll1[0][:, 2])).T
        sroll2 = datasets.make_swiss_roll(n_samples=n_samples // 2, noise=.05)
        sroll2 = np.vstack(
            (sroll2[0][:, 0], sroll2[0][:, 2])).T * 0.75  # shrink
        dataset = np.concatenate((sroll1, sroll2), 0), \
            np.concatenate((np.asarray([0] * sroll1.shape[0]),
                            np.asarray([1] * sroll2.shape[0])), 0)
    else:
        raise ValueError("unknown dataset_name: %r" % (dataset_name,))
    return dataset