def test_pickle(self):
    # This test is not really useful anymore: the primitive no longer keeps random
    # state, so outputs depend only on inputs, not on previous calls to "produce".
    hyperparams_class = RandomPrimitive.metadata.get_hyperparams()
    primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

    inputs = container.List(list(range(4)), generate_metadata=True)
    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
    self.assertTrue(numpy.allclose(
        call_metadata.value.values,
        container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape(4, 1)))

    pickled_primitive = pickle.dumps(primitive)

    inputs = container.List(list(range(4, 8)), generate_metadata=True)
    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
    self.assertTrue(numpy.allclose(
        call_metadata.value.values,
        container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))

    unpickled_primitive = pickle.loads(pickled_primitive)
    call_metadata = self.call_primitive(unpickled_primitive, 'produce', inputs=inputs)
    self.assertTrue(numpy.allclose(
        call_metadata.value.values,
        container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    np.random.seed(1234)

    G = inputs[0].copy()

    # The optional fourth input flags link prediction; default to False if it is
    # absent or malformed.
    try:
        link_prediction = inputs[3]
        if not isinstance(link_prediction, bool):
            link_prediction = False
    except (IndexError, KeyError):
        link_prediction = False

    if link_prediction:
        g = np.array(G.copy())
    else:
        g = graspyPTR(G)

    n = g.shape[0]

    max_dimension = self.hyperparams['max_dimension']
    if max_dimension > n:
        max_dimension = n
    n_elbows = self.hyperparams['which_elbow']

    if self.hyperparams['use_attributes']:
        adj = [g]
        more_attributes = True
        attr_number = 1
        while more_attributes:
            temp_attr = np.array(list(networkx.get_node_attributes(G, 'attr' + str(attr_number)).values()))
            if len(temp_attr) == 0:
                more_attributes = False
            else:
                # Pairwise squared-distance kernel between node attribute vectors.
                K = np.sum((temp_attr[:, np.newaxis][:, np.newaxis, :]
                            - temp_attr[:, np.newaxis][np.newaxis, :, :]) ** 2, axis=-1)
                adj.append(graspyPTR(K))
                attr_number += 1
        M = len(adj)
        if M > 1:
            omni_object = graspyOMNI(n_components=max_dimension, n_elbows=n_elbows)
            X_hats = omni_object.fit_transform(adj)
            X_hat = np.mean(X_hats, axis=0)
            inputs[0] = container.ndarray(X_hat.copy())
            return base.CallResult(inputs)

    ase_object = graspyASE(n_components=max_dimension, n_elbows=n_elbows)
    X_hat = ase_object.fit_transform(g)

    inputs[0] = container.ndarray(X_hat)
    return base.CallResult(inputs)
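# A minimal sketch of the attribute-kernel step above, under the assumption that
# each node attribute is a length-p vector: broadcasting over two new axes yields
# the n x n matrix of pairwise squared Euclidean distances, which is then passed
# through pass-to-ranks and embedded jointly with the adjacency matrix.
import numpy as np

attrs = np.array([[0.0, 1.0], [1.0, 1.0], [2.0, 0.0]])    # 3 nodes, 2 attributes
diff = attrs[:, np.newaxis, :] - attrs[np.newaxis, :, :]  # shape (3, 3, 2)
K = np.sum(diff ** 2, axis=-1)                            # pairwise squared distances
assert K[0, 1] == 1.0 and K[0, 2] == 5.0 and np.allclose(K, K.T)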
def test_regularization(self):
    # Generate data: a well-posed problem with nSamples > nFeatures.
    np.random.seed(0)
    nSamples = 10
    nFeatures = 5
    true_coef, inputs, outputs = generate_linear_data(nSamples, nFeatures)

    # Test fitting with default hyperparams.
    hp = OWLHyperparams(OWLHyperparams.defaults())
    primitive = OWLRegression(hyperparams=hp)
    primitive.set_training_data(inputs=inputs, outputs=outputs)
    primitive.fit()
    ps = primitive.get_params()
    self.assertTrue(np.all(ps['coef'] == primitive._coef))
    self.assertTrue(ps['intercept'] == primitive._intercept)
    self.assertEqual(ps['fitted'], True)
    relative_error = np.linalg.norm(ps['coef'] - true_coef) / np.linalg.norm(true_coef)
    self.assertTrue(relative_error < 0.2)
    self.assertTrue(np.abs(ps['intercept']) < 0.1)

    # Test fitting with customized hyperparams: OSCAR (linearly decaying weights).
    hp = OWLHyperparams(
        OWLHyperparams.defaults(),
        weight_type='linear',
        weight_max_val=0.01,
        weight_max_off=0,
        weight_min_val=0.005,
        weight_min_off=nFeatures - 1,
        learning_rate=0.001)
    primitive = OWLRegression(hyperparams=hp)
    primitive.set_training_data(inputs=inputs, outputs=outputs)
    primitive.fit()
    ps = primitive.get_params()
    self.assertTrue(np.all(ps['coef'] == primitive._coef))
    self.assertTrue(ps['intercept'] == primitive._intercept)
    self.assertEqual(ps['fitted'], True)
    relative_error = np.linalg.norm(ps['coef'] - true_coef) / np.linalg.norm(true_coef)
    self.assertTrue(relative_error < 0.2)
    self.assertTrue(np.abs(ps['intercept']) < 0.1)

    # Test single / multiple sample produce.
    inputs_produce = container.ndarray(np.random.randn(1, nFeatures))
    outputs_produce = primitive.produce(inputs=inputs_produce).value
    self.assertEqual(outputs_produce.shape, (1,))

    inputs_produce = container.ndarray(np.random.randn(2, nFeatures))
    outputs_produce = primitive.produce(inputs=inputs_produce).value
    self.assertEqual(outputs_produce.shape, (2,))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Input: an n x n adjacency matrix or a networkx Graph, plus the learningData table.

    Returns the largest connected component of the graph.
    """
    G = inputs['0']
    csv = inputs['learningData']

    if len(csv) != 0:
        if len(list(nx.get_node_attributes(G, 'nodeID').values())) == 0:
            # networkx 2.x argument order: (graph, values, name).
            nx.set_node_attributes(G, -1, 'nodeID')
            for i in range(len(G)):
                G.nodes[i]['nodeID'] = i
        nodeIDs = list(nx.get_node_attributes(G, 'nodeID').values())
        nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))
        return base.CallResult(container.List([G.copy(), nodeIDs, csv]))

    if isinstance(G, np.ndarray):
        if G.ndim == 2 and G.shape[0] == G.shape[1]:  # n x n matrix
            G = nx.Graph(G)
        else:
            raise TypeError("Networkx graphs or n x n numpy arrays only")

    # Keep the largest connected component.
    subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]
    G_connected = max(subgraphs, key=len)

    nodeIDs = list(nx.get_node_attributes(G_connected, 'nodeID').values())
    nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

    return base.CallResult(container.List([G_connected.copy(), nodeIDs, csv]))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    _X = inputs.T
    d, numVectors = _X.shape

    Uhat = self._U
    self._grastaOPTIONS.subsampling = self._subsampling

    Lhat = np.zeros(_X.shape)
    for i in range(numVectors):
        _x = _X[:, i]

        if self._grastaOPTIONS.subsampling < 1:
            # Observe a random subsample of the coordinates.
            _xidx = self._random_state.choice(
                self._dim,
                int(np.ceil(self._grastaOPTIONS.subsampling * self._dim)),
                replace=False)
        else:
            # Observe all coordinates that are not missing.
            _xidx = np.where(~np.isnan(_x))[0]

        U, w, s, STATUS_new, admm_OPTS = self._grasta_stream(Uhat, _x, _xidx)
        Lhat[:, i] = U @ w

    return base.CallResult(container.ndarray(Lhat.T, generate_metadata=True))
def test_ndarray(self):
    with self.assertLogs(SumPrimitive.metadata.query()['python_path'], level='DEBUG') as cm:
        hyperparams_class = SumPrimitive.metadata.get_hyperparams()
        primitive = SumPrimitive(hyperparams=hyperparams_class.defaults(), docker_containers=self.get_docker_containers())

        inputs = container.ndarray([[1, 2, 3, 4], [5, 6, 7, 8]], generate_metadata=True)

        call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

    # Because it is a singleton produce method we know there is exactly one value in the outputs.
    result = call_metadata.value[0]

    self.assertEqual(result, 36)
    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['structural_type'], float)

    self.assertEqual(len(cm.records), 2)
    self.assertEqual(cm.records[0].name, SumPrimitive.metadata.query()['python_path'])
    self.assertEqual(cm.records[1].name, SumPrimitive.metadata.query()['python_path'])

    self.assertIsInstance(cm.records[0].data, numpy.ndarray)
    self.assertEqual(cm.records[1].response.status, 200)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Pass the adjacency matrix to ranks.

    **Positional Arguments:**

    inputs:
        - JHUGraph adjacency matrix
    """
    path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "ptr.interface.R")
    cmd = """
    source("%s")
    fn <- function(inputs) {
        ptr.interface(inputs)
    }
    """ % path

    result = robjects.r(cmd)(inputs)

    outputs = container.ndarray(result)
    return base.CallResult(outputs)
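# For intuition, a hedged pure-Python sketch of a pass-to-ranks transform; the
# primitive above delegates to ptr.interface.R, whose exact convention (tie
# handling, zero handling, scaling) may differ. `pass_to_ranks` here is an
# illustrative helper, not the repo's API.
import numpy as np
from scipy.stats import rankdata

def pass_to_ranks(A: np.ndarray) -> np.ndarray:
    ranked = rankdata(A.ravel()).reshape(A.shape)  # average ranks over all entries
    return ranked / (A.size + 1)                   # rescale ranks into (0, 1)

print(pass_to_ranks(np.array([[0.0, 5.0], [5.0, 0.0]])))  # ties get averaged ranks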
class Hyperparams(hyperparams.Hyperparams):
    n_components = hyperparams.Hyperparameter[typing.Optional[int]](
        default=None,
        description='Number of components (< n_classes - 1) for dimensionality reduction.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    learning_rate = hyperparams.Uniform(
        lower=0.01,
        upper=2,
        default=0.1,
        description='Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/TuningParameter',
            'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
        ],
    )
    array1 = hyperparams.Hyperparameter[container.ndarray](
        default=container.ndarray(numpy.array([[1, 2], [3, 4]]), generate_metadata=True),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    array2 = hyperparams.Hyperparameter[container.DataFrame](
        default=container.DataFrame([[1, 2], [3, 4]], generate_metadata=True),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
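# Example construction for the class above, mirroring the d3m hyperparams
# pattern used elsewhere in these tests: start from defaults() and override
# individual values by keyword.
hp = Hyperparams(Hyperparams.defaults(), n_components=2, learning_rate=0.5)
assert hp['n_components'] == 2
assert numpy.array_equal(hp['array1'], numpy.array([[1, 2], [3, 4]]))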
def test_basic(self):
    hyperparams_class = RandomPrimitive.metadata.get_hyperparams()
    primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

    inputs = container.List(list(range(4)), generate_metadata=True)
    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

    self.assertTrue(numpy.allclose(
        call_metadata.value.values,
        container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape((4, 1))))
    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)

    xhat = self._inputs_1
    yhat = self._inputs_2

    seeds = self._reference['match'].astype(int).astype(bool)
    xhat_seed_names = self._reference[self._reference.columns[1]][seeds].values
    yhat_seed_names = self._reference[self._reference.columns[2]][seeds].values

    n_seeds = len(xhat_seed_names)
    # Seed positions are row indices, so store them as integers.
    x_seeds = np.zeros(n_seeds, dtype=int)
    y_seeds = np.zeros(n_seeds, dtype=int)
    for i in range(n_seeds):
        x_seeds[i] = np.where(xhat[xhat.columns[0]] == xhat_seed_names[i])[0][0]
        y_seeds[i] = np.where(yhat[yhat.columns[0]] == yhat_seed_names[i])[0][0]

    # TODO: do this more carefully; assumes the embedding columns start at index 1.
    xhat_embedding = xhat.values[:, 1:].astype(np.float32)
    yhat_embedding = yhat.values[:, 1:].astype(np.float32)

    # Gaussian-kernel similarity matrices from the pairwise embedding distances.
    S_xx = np.exp(-cdist(xhat_embedding, xhat_embedding))
    S_yy = np.exp(-cdist(yhat_embedding, yhat_embedding))

    gmp = GraphMatch(shuffle_input=False)
    match = gmp.fit_predict(S_xx, S_yy, x_seeds, y_seeds)
    self._match = container.ndarray(match)

    self._fitted = True
    return CallResult(None)
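# A toy check of the seeded graph matching step, assuming GraphMatch is the
# graspy solver imported by this module: relabel a small graph with a known
# permutation and see whether fit_predict recovers it (it may not on every
# run, since the underlying FAQ solver is a local method).
import numpy as np

rng = np.random.RandomState(0)
A = rng.binomial(1, 0.3, size=(8, 8))
A = np.triu(A, 1) + np.triu(A, 1).T          # undirected simple graph
perm = rng.permutation(8)
B = A[np.ix_(perm, perm)]                    # relabelled copy of A
recovered = GraphMatch(shuffle_input=False).fit_predict(A, B)
print((A == B[np.ix_(recovered, recovered)]).mean())  # 1.0 if fully recovered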
def _read_fileuri(self, fileuri: str) -> container.ndarray:
    """
    @see https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/video_reader.py#L65

    :param fileuri:
    :return:
    """
    capture = cv2.VideoCapture(fileuri)
    frames = []

    try:
        while capture.isOpened():
            ret, frame = capture.read()
            if not ret:
                break

            assert frame.dtype == np.uint8, frame.dtype
            if frame.ndim == 2:
                # Make sure there are always three dimensions.
                frame = frame.reshape(list(frame.shape) + [1])
            assert frame.ndim == 3, frame.ndim
            frames.append(frame)
    finally:
        capture.release()

    return container.ndarray(np.array(frames), generate_metadata=False)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Apply neural network-based feature extraction to image_tensor."""
    self._lazy_init()

    image_tensor = inputs[1]
    image_d3mIndex = inputs[0]

    if len(image_tensor.shape) != 4:
        raise ValueError('Expect shape to have 4 dimensions')

    resized = False
    if self._resize_data:
        # The network expects 224 x 224 inputs.
        if not (image_tensor.shape[1] == 224 and image_tensor.shape[2] == 224):
            resized = True
            y = np.empty((image_tensor.shape[0], 224, 224, 3))
            for index in range(image_tensor.shape[0]):
                y[index] = imresize(image_tensor[index], (224, 224))
            image_tensor = y

    # preprocess() modifies the data in place, so copy it unless we already own it.
    if self._preprocess_data:
        if resized:
            # Okay to modify image_tensor, since it is not the input data.
            data = image_tensor
        else:
            data = image_tensor.copy()
        self._preprocess(data)
    else:
        data = image_tensor

    # Use the stored graph so repeated calls from a TA3 system reuse the same session.
    with self._graph.as_default():
        output_ndarray = self._model.predict(data)

    output_ndarray = output_ndarray.reshape(output_ndarray.shape[0], -1)
    output_dataFrame = container.DataFrame(container.ndarray(output_ndarray))

    # If generate_metadata is true, update the metadata.
    if self.hyperparams["generate_metadata"]:
        for each_column in range(output_ndarray.shape[1]):
            metadata_selector = (mbase.ALL_ELEMENTS, each_column)
            metadata_each_column = {
                'semantic_types': (
                    'https://metadata.datadrivendiscovery.org/types/TabularColumn',
                    'https://metadata.datadrivendiscovery.org/types/Attribute',
                ),
            }
            output_dataFrame.metadata = output_dataFrame.metadata.update(
                metadata=metadata_each_column, selector=metadata_selector)

    # Update the original index to be d3mIndex.
    output_dataFrame = output_dataFrame.set_index(image_d3mIndex)

    self._has_finished = True
    self._iterations_done = True
    return CallResult(output_dataFrame, self._has_finished, self._iterations_done)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(inputs, return_all=True)

    if self.fitted:
        result = self._sdne._Y
    else:
        dim = self.hyperparams['dimension']
        alpha = self.hyperparams['alpha']
        beta = self.hyperparams['beta']

        self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta)
        produce_data = networkx.from_scipy_sparse_matrix(produce_data)
        self._sdne.learn_embedding(graph=produce_data)
        self._model = self._sdne._model
        result = self._sdne._Y

    target_types = [
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
    ]
    if self.hyperparams['return_list']:
        result_np = container.ndarray(result, generate_metadata=True)
        return_list = d3m_List([result_np, inputs[1], inputs[2]], generate_metadata=True)
        return CallResult(return_list, True, 1)

    learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
    learn_df = get_columns_not_of_type(learn_df, target_types)
    learn_df = learn_df.remove_columns([learn_df.columns.get_loc('nodeID')])

    result_df = d3m_DataFrame(result, generate_metadata=True)
    result_df = result_df.loc[result_df.index.isin(learning_df['d3mIndex'].values)]

    for column_index in range(result_df.shape[1]):
        col_dict = dict(result_df.metadata.query((ALL_ELEMENTS, column_index)))
        col_dict['structural_type'] = type(1.0)
        col_dict['name'] = str(learn_df.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        )
        result_df.metadata = result_df.metadata.update((ALL_ELEMENTS, column_index), col_dict)

    result_df.index = learn_df.index.copy()
    output = utils.append_columns(learn_df, result_df)
    return CallResult(output, True, 1)
def produce_subspace(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    # The subspace estimate does not depend on the inputs; return a copy of the fitted basis.
    U = self._U.copy()
    return base.CallResult(container.ndarray(U, generate_metadata=True))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    np.random.seed(1234)

    G = inputs[0].copy()
    g = graspyPTR(G)

    n = g.shape[0]

    max_dimension = self.hyperparams['max_dimension']
    if max_dimension > n:
        max_dimension = n
    n_elbows = self.hyperparams['which_elbow']

    # Attribute handling is disabled for LSE pending the question in the
    # original source: "What does Omni(DAD) even look like?" The disabled
    # branch mirrored the ASE primitive's attribute-kernel construction,
    # embedded the omnibus matrix with graspyLSE, reshaped the result to
    # (M, n, d), and averaged over the M graphs.

    lse_object = graspyLSE(n_components=max_dimension, n_elbows=n_elbows)
    X_hat = lse_object.fit_transform(g)

    inputs[0] = container.ndarray(X_hat)
    return base.CallResult(inputs)
def produce_sparse(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    # The sparse component is the residual left after removing the low-rank estimate.
    Lhat = self.produce(inputs=inputs).value
    Shat = inputs - Lhat
    return base.CallResult(container.ndarray(Shat, generate_metadata=True))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    inputs_timeseries = inputs[1]
    inputs_d3mIndex = inputs[0]

    if not self._fitted:
        return CallResult(None, True, 0)

    if isinstance(inputs_timeseries, np.ndarray):
        X = np.zeros((inputs_timeseries.shape[0], self._y_dim))
    else:
        X = np.zeros((len(inputs_timeseries), self._y_dim))

    for i, series in enumerate(inputs_timeseries):
        if series.shape[1] > 1 and not self._value_found:
            # No value column was identified: stack all columns into one series.
            series_output = pd.DataFrame()
            for j in range(series.shape[1]):
                series_output = pd.concat([series_output, series.iloc[:, j]])
        else:
            series_output = series

        if series_output.shape[0] < self._y_dim:
            # Pad with zeros.
            X[i, :series_output.shape[0]] = series_output.iloc[:series_output.shape[0], self._value_dimension]
        else:
            # Truncate to fit.
            X[i, :] = series_output.iloc[:self._y_dim, self._value_dimension]

    # Save the result in DataFrame format.
    output_ndarray = self._model.transform(X)
    output_dataFrame = container.DataFrame(container.ndarray(output_ndarray))

    if self.hyperparams["generate_metadata"]:
        # Add metadata if required.
        for each_column in range(output_ndarray.shape[1]):
            metadata_selector = (mbase.ALL_ELEMENTS, each_column)
            metadata_each_column = {
                'semantic_types': (
                    'https://metadata.datadrivendiscovery.org/types/TabularColumn',
                    'https://metadata.datadrivendiscovery.org/types/Attribute',
                ),
            }
            output_dataFrame.metadata = output_dataFrame.metadata.update(
                metadata=metadata_each_column, selector=metadata_selector)

    # Update the original index to be d3mIndex.
    output_dataFrame = output_dataFrame.set_index(inputs_d3mIndex)
    return CallResult(output_dataFrame, True, 1)
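# A minimal sketch of the pad-or-truncate step above: every series is forced
# into a fixed-length row, zero-padded when short and truncated when long.
# `fix_length` is an illustrative helper, not part of the primitive.
import numpy as np

def fix_length(series: np.ndarray, y_dim: int) -> np.ndarray:
    row = np.zeros(y_dim)
    k = min(len(series), y_dim)
    row[:k] = series[:k]
    return row

assert np.array_equal(fix_length(np.array([1.0, 2.0]), 4), [1.0, 2.0, 0.0, 0.0])
assert np.array_equal(fix_length(np.arange(6.0), 4), [0.0, 1.0, 2.0, 3.0])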
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Perform out-of-sample adjacency spectral embedding on a graph.
    """
    np.random.seed(1234)

    g = inputs[0].copy()
    if type(g) == networkx.classes.graph.Graph:
        g = networkx.to_numpy_array(g)

    n = g.shape[0]

    # Normalized Laplacian L = D^{-1/2} A D^{-1/2}.
    D = np.linalg.pinv(np.diag(g.sum(axis=1)) ** (1 / 2))
    L = D @ g @ D

    # Hyperparams are read-only, so cap the dimension in a local variable.
    d_max = min(self.hyperparams['max_dimension'], n - 1)

    in_sample_n = min(self.hyperparams['n_in_sample'], n)

    # Sample the in-sample vertices without replacement.
    in_sample_idx = np.random.choice(n, in_sample_n, replace=False)
    out_sample_idx = np.setdiff1d(list(range(n)), in_sample_idx)

    in_sample_A = L[np.ix_(in_sample_idx, in_sample_idx)]
    out_sample_A = L[np.ix_(out_sample_idx, in_sample_idx)]

    tsvd = TruncatedSVD(n_components=d_max)
    tsvd.fit(in_sample_A)

    eig_vectors = tsvd.components_.T
    eig_values = tsvd.singular_values_

    # Keep only the dimensions up to the chosen elbow of the scree plot.
    elbow = self._profile_likelihood_maximization(eig_values, self.hyperparams['which_elbow'])[-1]
    eig_vectors = eig_vectors[:, :elbow + 1].copy()
    eig_values = eig_values[:elbow + 1].copy()
    d = len(eig_values)

    in_sample_embedding = eig_vectors.dot(np.diag(eig_values ** 0.5))
    # Out-of-sample extension: project the held-out rows onto the scaled eigenbasis.
    out_sample_embedding = out_sample_A @ eig_vectors @ np.diag(1 / np.sqrt(eig_values))

    embedding = np.zeros((n, d))
    embedding[in_sample_idx] = in_sample_embedding
    embedding[out_sample_idx] = out_sample_embedding

    inputs[0] = container.ndarray(embedding)
    return base.CallResult(inputs)
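# A hedged sketch of what _profile_likelihood_maximization above presumably
# implements: the Zhu & Ghodsi (2006) elbow selection. Split the sorted
# singular values at each index, model both sides as Gaussians with a pooled
# variance, and keep the split with the highest profile log-likelihood.
# `profile_likelihood_elbow` is an illustrative stand-in, not the repo's code.
import numpy as np
from scipy.stats import norm

def profile_likelihood_elbow(values: np.ndarray) -> int:
    values = np.sort(values)[::-1]
    d = len(values)
    likelihoods = []
    for q in range(1, d):
        mu1, mu2 = values[:q].mean(), values[q:].mean()
        pooled_var = (np.sum((values[:q] - mu1) ** 2)
                      + np.sum((values[q:] - mu2) ** 2)) / max(d - 2, 1)
        sd = max(np.sqrt(pooled_var), 1e-12)
        likelihoods.append(norm.logpdf(values[:q], mu1, sd).sum()
                           + norm.logpdf(values[q:], mu2, sd).sum())
    return int(np.argmax(likelihoods)) + 1  # number of values kept before the elbow

print(profile_likelihood_elbow(np.array([10.0, 9.5, 9.0, 1.0, 0.9, 0.8])))  # prints 3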
def generate_linear_data(nSamples, nFeatures):
    """
    y = X * coef + noise

    noise = 0, for simplicity of the unit test.
    """
    # Design matrix.
    X = np.random.randn(nSamples, nFeatures)
    X = X - np.mean(X, 0)  # centered
    X = X / np.linalg.norm(X, ord=2, axis=0)  # normalized columns

    # Noiseless, so the test is deterministic.
    noise = np.zeros(nSamples)

    # Coefficients.
    coef = np.random.randn(nFeatures)

    y = X.dot(coef) + noise
    return coef, container.ndarray(X, generate_metadata=True), container.ndarray(y, generate_metadata=True)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    graph = inputs['0']
    csv = inputs['1']

    linktypes = np.array(csv['linkType'], dtype='int32')
    uniq_linktypes, n_i = np.unique(linktypes, return_counts=True)
    n_linktypes = len(uniq_linktypes)

    sources = np.array(csv['source_nodeID'], dtype='int32')
    targets = np.array(csv['target_nodeID'], dtype='int32')
    nodes = set(np.concatenate((sources, targets)))
    n_nodes = len(nodes)

    info = np.array(csv['linkExists'], dtype='int32')
    n_info = len(info)

    # Per-link-type edge density estimates.
    edge_counts = np.zeros(n_linktypes)
    for i in range(n_info):
        temp_link_type = linktypes[i]
        edge_counts[temp_link_type] += info[i]
    p_hats = edge_counts / n_i

    # Set up one graph per link type, initialized at its estimated density.
    graphs = [p_hats[i] * np.ones(shape=(n_nodes, n_nodes)) for i in range(n_linktypes)]

    for i in range(n_info):
        temp_link_type = int(linktypes[i])
        graphs[temp_link_type][sources[i], targets[i]] = info[i]
        graphs[temp_link_type][targets[i], sources[i]] = info[i]

    # Assemble the block matrix: per-type graphs on the diagonal,
    # pairwise averages off the diagonal.
    big_graph = np.zeros(shape=(n_nodes * int(n_linktypes), n_nodes * int(n_linktypes)))
    for i in range(n_linktypes):
        big_graph[i * n_nodes:(i + 1) * n_nodes, i * n_nodes:(i + 1) * n_nodes] = graphs[i]

    for i in range(n_linktypes):
        for j in range(i + 1, n_linktypes):
            average = (graphs[i] + graphs[j]) / 2
            big_graph[i * n_nodes:(i + 1) * n_nodes, j * n_nodes:(j + 1) * n_nodes] = average
            big_graph[j * n_nodes:(j + 1) * n_nodes, i * n_nodes:(i + 1) * n_nodes] = average

    return base.CallResult(container.List([container.ndarray(big_graph)]))
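# Worked example of the block layout assembled above, which mirrors the
# omnibus construction: per-type graphs on the diagonal, pairwise averages
# off the diagonal.
import numpy as np

g0 = np.array([[0.0, 1.0], [1.0, 0.0]])  # link type 0 has the edge
g1 = np.zeros((2, 2))                    # link type 1 does not
big = np.block([[g0, (g0 + g1) / 2],
                [(g0 + g1) / 2, g1]])
print(big)  # the off-diagonal blocks are 0.5 where only g0 has an edge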
def _load_image_group(self, uris: List[str], bands: List[str], base_uri: str, max_dimension: int) -> container.ndarray:
    zipped = zip(bands, uris)
    images = list(map(lambda image: self._load_image(image[0], image[1], base_uri), zipped))

    # Reshape (upsample) images so they all fit within one array.
    if self.hyperparams["compress_data"]:
        # Store a header of the dtype character and the data shape as unsigned integers.
        # With C struct alignment this occupies 16 bytes (1 + 3 padding + 4 + 4 + 4).
        output_bytes = bytearray(struct.pack(
            "cIII",
            images[0][1].dtype.char.encode(),
            len(images),
            max_dimension,
            max_dimension,
        ))
        for band, image in images:
            output_bytes.extend(self._bilinear_resample(image, max_dimension).tobytes())
        output_compressed_bytes = lz4.frame.compress(bytes(output_bytes))
        output = np.frombuffer(
            output_compressed_bytes,
            dtype="uint8",
            count=len(output_compressed_bytes),
        )
    else:
        output = np.ndarray((
            len(DataFrameSatelliteImageLoaderPrimitive._BAND_ORDER),
            max_dimension,
            max_dimension,
        ))
        for band, image in images:
            band_idx = DataFrameSatelliteImageLoaderPrimitive._BAND_ORDER[self._normalized_band_id(band)]
            output[band_idx] = self._bilinear_resample(image, max_dimension)

    output = container.ndarray(
        output,
        {
            "schema": metadata_base.CONTAINER_SCHEMA_VERSION,
            "structural_type": container.ndarray,
        },
        generate_metadata=True,
    )
    return output
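# A sketch of decoding the compressed layout written above, assuming the
# reader's native struct alignment matches the writer's (struct.calcsize("cIII")
# is 16: a 1-byte char, 3 padding bytes, then three 4-byte unsigned ints).
# `decode_image_group` is an illustrative helper, not part of the primitive.
import struct
import lz4.frame
import numpy as np

def decode_image_group(buf: bytes) -> np.ndarray:
    raw = lz4.frame.decompress(buf)
    header_size = struct.calcsize("cIII")
    dtype_char, n_bands, height, width = struct.unpack("cIII", raw[:header_size])
    planes = np.frombuffer(raw[header_size:], dtype=np.dtype(dtype_char.decode()))
    return planes.reshape(n_bands, height, width)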
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Perform Laplacian spectral embedding on a graph.

    **Positional Arguments:**

    inputs:
        - Graph in JHUGraph format

    **Optional Arguments:**

    max_dimension:
        - The number of dimensions in which to embed the data
    """
    max_dimension = self.hyperparams['max_dimension']

    path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "lse.interface.R")
    cmd = """
    source("%s")
    fn <- function(inputs, max_dimension) {
        lse.interface(inputs, max_dimension)
    }
    """ % path

    result = robjects.r(cmd)(inputs, max_dimension)

    vectors = container.ndarray(result[0])
    eig_values = container.ndarray(result[1])
    return base.CallResult([vectors, eig_values])
def test_columns_sum(self):
    dataframe = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, generate_metadata=True)
    dataframe_sum = utils.columns_sum(dataframe)

    self.assertEqual(dataframe_sum.values.tolist(), [[6, 15]])
    self.assertEqual(dataframe_sum.metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'a')
    self.assertEqual(dataframe_sum.metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'b')

    array = container.ndarray(dataframe, generate_metadata=True)
    array_sum = utils.columns_sum(array)

    self.assertEqual(array_sum.tolist(), [[6, 15]])
    self.assertEqual(array_sum.metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'a')
    self.assertEqual(array_sum.metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'b')
def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> container.ndarray:
    image_array = container.ndarray(
        numpy.array([[fileuri.split('/')[-1]]], dtype=object),
        {
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': container.ndarray,
        },
        generate_metadata=False,
    )
    image_array.metadata = image_array.metadata.update((), {
        'image_reader_metadata': {
            'foobar': 42,
        },
    })
    return image_array
def test_ndarray(self):
    array = container.ndarray(numpy.array([1, 2, 3], dtype=numpy.int64), generate_metadata=True)

    self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.numpy.ndarray',
            'dimension': {
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }])
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Non-parametric clustering

    **Positional Arguments:**

    xhat1:
        - A numpy.ndarray type "matrix"
    xhat2:
        - A numpy.ndarray type "matrix"

    **Optional Arguments:**

    sigma:
        - A sigma for the Gaussian kernel
    """
    xhat1 = inputs[0]
    xhat2 = inputs[1]
    sigma = self.hyperparams['sigma']

    path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "nonpar.interface.R")
    cmd = """
    source("%s")
    fn <- function(xhat1, xhat2, sigma) {
        nonpar.interface(xhat1, xhat2, sigma)
    }
    """ % path

    result = np.array(robjects.r(cmd)(xhat1, xhat2, sigma))

    outputs = container.ndarray(result)
    return base.CallResult(outputs)
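# For intuition, a hedged numpy sketch of the Gaussian kernel that nonparametric
# two-sample comparisons between embeddings are typically built on; the actual
# statistic lives in nonpar.interface.R and may normalize differently.
# `gaussian_kernel` is an illustrative helper, not the repo's API.
import numpy as np
from scipy.spatial.distance import cdist

def gaussian_kernel(x1: np.ndarray, x2: np.ndarray, sigma: float) -> np.ndarray:
    return np.exp(-cdist(x1, x2, 'sqeuclidean') / (2 * sigma ** 2))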
def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
    if self._fitted:
        return base.CallResult(None)

    embeddings = self._training_inputs[1][0]
    csv = self._training_inputs[0]
    n_nodes, n_links = self._training_inputs[3]
    n_info = csv.shape[0]

    # One pair of (non-edge, edge) rank lists per link type, plus one pooled over all types.
    ranks = [[[], []] for i in range(n_links + 1)]

    # If the link types are not numeric, collapse them to a single type.
    try:
        int(np.array(csv['linkType'])[0])
    except (ValueError, TypeError):
        csv['linkType'] = np.zeros(n_info)

    csv_headers = csv.columns
    for header in csv_headers:
        if header[:6] == "source":
            SOURCE = header
        elif header[:6] == "target":
            TARGET = header

    for i in range(n_info):
        temp_link = int(np.array(csv['linkType'])[i])
        temp_exists = int(np.array(csv['linkExists'])[i])
        temp_source = int(np.array(csv[SOURCE])[i])
        temp_target = int(np.array(csv[TARGET])[i])
        temp_dot = (embeddings[temp_link * n_nodes + temp_source - 1]
                    @ embeddings[temp_link * n_nodes + temp_target - 1])
        ranks[temp_link][temp_exists].append(temp_dot)
        ranks[-1][temp_exists].append(temp_dot)

    for i in range(len(ranks)):
        ranks[i][0] = np.sort(ranks[i][0])
        ranks[i][1] = np.sort(ranks[i][1])

    self._embeddings = container.ndarray(embeddings)
    self._inner_products = container.List(ranks)

    self._fitted = True
    return base.CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    """
    Compute the predictions given inputs of shape n by m, yielding an array of size n.
    The inputs must match the dimensionality of the training data.
    """
    # Assorted error checking.
    if self._fitted is False:
        raise ValueError("Calling produce before fitting.")

    if inputs.shape[1] != self._coef.shape[0]:
        raise ValueError('Input dimension is wrong.')

    outputs: container.ndarray = container.ndarray(inputs.dot(self._coef) + self._intercept)
    outputs.metadata = inputs.metadata.clear(for_value=outputs, source=self)

    return base.CallResult(outputs)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)

    xhat = self._inputs_1
    yhat = self._inputs_2

    temp_train = self._reference.merge(xhat, how='left', on='e_nodeID')
    temp_train = temp_train.merge(yhat, how='left', on='g_nodeID')
    temp_train = temp_train[temp_train['match'].astype(int).astype(bool)]

    # Columns 4:-300 hold the xhat embedding; the final 300 columns hold the yhat embedding.
    xhat_train = temp_train.values[:, 4:-300].astype(np.float32)
    yhat_train = temp_train.values[:, -300:].astype(np.float32)

    # Learn the orthogonal matrix that best rotates yhat onto xhat.
    w, _ = orthogonal_procrustes(yhat_train, xhat_train)
    self._w = container.ndarray(w)

    self._fitted = True
    return CallResult(None)
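# Sanity sketch of the alignment step above: orthogonal_procrustes(Y, X)
# returns the orthogonal W minimizing ||Y @ W - X||_F, so the learned matrix
# rotates the yhat embedding into the xhat embedding's frame.
import numpy as np
from scipy.linalg import orthogonal_procrustes

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
R, _ = np.linalg.qr(rng.randn(3, 3))  # a random rotation/reflection
Y = X @ R.T                           # Y is X expressed in a rotated frame
W, _ = orthogonal_procrustes(Y, X)
assert np.allclose(Y @ W, X, atol=1e-8)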
def setup(self):
    self.large_dataframe = container.DataFrame(pandas.DataFrame(
        {str(i): [str(j) for j in range(10000)] for i in range(50)},
        columns=[str(i) for i in range(50)]), generate_metadata=True)

    self.large_list = container.List([
        container.List([str(j) for i in range(50)]) for j in range(10000)
    ], generate_metadata=True)

    self.large_ndarray = container.ndarray(numpy.array(
        [[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)],
        dtype=object), generate_metadata=True)

    self.large_dict_list = container.List(
        {str(i): {str(j): j for j in range(10000)} for i in range(50)},
        generate_metadata=True)