from numpy import reshape, vstack
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier


class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of
        # speed (substantial improvement), accuracy, and reduced memory usage
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than
        # algorithm='kd_tree'.
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples):
        #   euclidean   0.950025
        #   manhattan   0.933533
        #   chebyshev   0.675662
        #   hamming     0.708646
        #   canberra    0.934033
        #   braycurtis  0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree',
                                          metric='euclidean')

    def fit(self, trainExamples):
        # Flatten each example's HEIGHT x WIDTH image into one row, stack the
        # rows, and project them onto the 20 SVD components.
        X = self.decomposer.fit_transform(
            vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples]))
        Y = [x.Y for x in trainExamples]
        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        # Apply the projection learned in fit() before classifying.
        X = self.decomposer.transform(
            vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples]))
        return self.model.predict(X)
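# A minimal usage sketch for RawModel. The Example class and the random pixel
# data below are hypothetical stand-ins for whatever objects the original
# project used; RawModel only requires the X, Y, WIDTH, and HEIGHT attributes.
import numpy as np


class Example:
    WIDTH, HEIGHT = 28, 28

    def __init__(self, pixels, label=None):
        self.X = pixels  # (HEIGHT, WIDTH) array of pixel intensities
        self.Y = label


train = [Example(np.random.rand(28, 28), label=i % 10) for i in range(100)]
test = [Example(np.random.rand(28, 28)) for _ in range(5)]
print(RawModel().fit(train).predict(test))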
from sklearn.decomposition import TruncatedSVD as Op  # the wrapped scikit-learn estimator


class TruncatedSVDImpl:
    def __init__(self, n_components=2, algorithm='randomized', n_iter=5,
                 random_state=None, tol=0.0):
        self._hyperparams = {
            'n_components': n_components,
            'algorithm': algorithm,
            'n_iter': n_iter,
            'random_state': random_state,
            'tol': tol}
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # TruncatedSVD is unsupervised; y is accepted only for pipeline
        # compatibility and is ignored by the underlying estimator.
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
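# A short usage sketch for the wrapper above; the sparse matrix is synthetic,
# standing in for something like a term-document matrix.
from scipy.sparse import random as sparse_random

X = sparse_random(100, 50, density=0.1, random_state=42)
svd = TruncatedSVDImpl(n_components=5, random_state=42)
X_reduced = svd.fit(X).transform(X)
print(X_reduced.shape)  # (100, 5)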
import logging

import numpy
from sklearn.decomposition import PCA, TruncatedSVD

LOGGER = logging.getLogger(__name__)


def reduce_dimensionality(dataframe, maxvariance, columns_to_drop):
    '''
    Performs PCA on a pandas feature dataframe and keeps only the leading
    principal components needed to explain a defined share of the variance.
    '''
    dataframe_without_columns = dataframe.drop(columns_to_drop, axis=1)
    LOGGER.info('Columns to be used by pca: %s', dataframe_without_columns.columns)

    # Add tiny noise so constant columns do not break the decomposition.
    LOGGER.info('Adding noise to dataframe')
    dataframe_without_columns = dataframe_without_columns + numpy.random.normal(
        size=dataframe_without_columns.shape) * 1.e-19

    LOGGER.info('Starting PCA')
    try:
        pca = PCA(n_components='mle')
        pca.fit(dataframe_without_columns)
        # transform
        samples = pca.transform(dataframe_without_columns)
        # aggregated sum of variances
        sum_variance = sum(pca.explained_variance_)
        list_variance = pca.explained_variance_
    except ValueError:
        # n_components='mle' requires n_samples >= n_features; fall back to
        # truncated SVD when PCA rejects the input.
        LOGGER.info('PCA failed, using truncated SVD')
        svd = TruncatedSVD(n_components=3)
        svd.fit(dataframe_without_columns)
        samples = svd.transform(dataframe_without_columns)
        sum_variance = sum(svd.explained_variance_)
        list_variance = svd.explained_variance_

    # Keep components until their aggregated variance ratio reaches the
    # threshold (bounded so we never run past the available components).
    scomp = 0
    ncomp = 0
    while scomp < maxvariance and ncomp < len(list_variance):
        scomp += list_variance[ncomp] / sum_variance
        ncomp += 1

    # reduce dimensionality
    samples = samples[:, :ncomp]
    LOGGER.info('Number of features after PCA transformation %s', samples.shape[1])
    return samples
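# A usage sketch for reduce_dimensionality; the dataframe, column names, and
# variance threshold below are hypothetical.
import pandas

df = pandas.DataFrame(numpy.random.rand(200, 6),
                      columns=['f1', 'f2', 'f3', 'f4', 'f5', 'id'])
reduced = reduce_dimensionality(df, maxvariance=0.95, columns_to_drop=['id'])
print(reduced.shape)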
# ch2 (a SelectKBest(chi2, ...) selector) and svd (a TruncatedSVD instance)
# are assumed to be created earlier in the script, along with t0 = time().
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
print("done in %fs" % (time() - t0))
print()

if feature_names:
    feature_names = np.asarray(feature_names)

print(X_train.shape)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)
#u,o,X_train = fastica(X_train.toarray(),n_comp=1000)
print(X_train)
print(X_train.shape)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


def final_accuracy(predicted):
    count_same = 0
    total = 0
    for i in range(len(predicted)):
        tags_needed = question_tags[target_files[i].split("/")[-1]]
        count_same += len(set(predicted[i]) & set(tags_needed))
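# A self-contained sketch of the same chi2-then-SVD reduction expressed as a
# scikit-learn Pipeline; the documents, labels, k, and n_components are all
# illustrative.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import make_pipeline

docs = ["how to parse json in python", "segfault from pointer arithmetic",
        "python list comprehension", "array index out of bounds"]
labels = ["python", "c", "python", "c"]

lsa = make_pipeline(TfidfVectorizer(),
                    SelectKBest(chi2, k=8),
                    TruncatedSVD(n_components=2))
X_lsa = lsa.fit_transform(docs, labels)
print(X_lsa.shape)  # (4, 2)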
import logging

import numpy
from matplotlib import pyplot as plt
from numpy import linalg
from sklearn import linear_model, svm
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PolynomialFeatures

# FileIo, PoiDistrictLookup, OrderCategoricalLookup, RegressionInput, and the
# sliceTransform helper are project-local and assumed importable from the
# surrounding package.


class RunRegression(object):

    REGRESSION_TRAINING_INPUT_FILE_NAME = "RegressionTrainingInput.npz"
    REGRESSION_TESTING_INPUT_FILE_NAME = "RegressionTestingInput.npz"
    MAXIMUM_NUMBER_OF_JOBS = -1
    NUMBER_OF_CROSS_VALIDATION_FOLDS = 5
    ROWS_TO_USE_FOR_GAUSSIAN_KERNEL_REGRESSION = 15
    DISTRICT_SIZE = 132
    TIME_SIZE = 152
    POI_SIZE = 352
    WEATHER_SIZE = 9
    TRAFFIC_SIZE = 8

    def __init__(self):
        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0

        for file_name, data_set in [
                (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME, FileIo.TRAINING_DATA_SET),
                (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME, FileIo.TEST_DATA_SET)]:

            # Check and see if the data has already been saved
            try:
                logging.info("RunRegression: Trying to load " + data_set + " data")
                saved_data = numpy.load(file_name, mmap_mode='r')

            # If the data is not found, generate it
            except IOError:
                logging.info("RunRegression: Saved data not found. Generating " + data_set + " data")

                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(data_set, order_categorical_lookup,
                                                                   poi_district_lookup)

                if data_set == FileIo.TRAINING_DATA_SET:
                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(file_name,
                                order_keys=self.training_order_start_end_districts_and_time,
                                order_value_price=self.training_order_median_price,
                                order_value_number=self.training_number_of_orders)
                else:
                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(file_name,
                                order_keys=self.testing_order_start_end_districts_and_time,
                                order_value_price=self.testing_order_median_price,
                                order_value_number=self.testing_number_of_orders)

            # If the saved data is found, load it
            else:
                logging.info("RunRegression: Loading " + data_set + " data")

                if data_set == FileIo.TRAINING_DATA_SET:
                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                        saved_data['order_value_price'], \
                        saved_data['order_value_number']
                    self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " + str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:
                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                        saved_data['order_value_price'], \
                        saved_data['order_value_number']
                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " + str(len(self.testing_number_of_orders)) +
                                 " test data rows")

    def run_sgd_regression(self):
        """ Run sgd regression """
        losses = ["squared_loss"]
        penalties = ["none", "l2", "l1", "elasticnet"]
        initial_learning_rates = [0.1, 0.01, 0.001]
        learning_rates = ["constant", "optimal", "invscaling"]

        lowest_ride_prediction_error = float('inf')
        best_loss = ""
        best_penalty = ""
        best_initial_learning_rate = 0.0
        best_learning_rate = ""

        # Find the best hyper-parameters
        for loss in losses:
            for penalty in penalties:
                for initial_learning_rate in initial_learning_rates:
                    for learning_rate in learning_rates:
                        mean_ride_prediction_error = 0.0

                        # Do k-fold cross-validation using mini-batch training.
                        for testing_fold_number in range(RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS):

                            # Create the sgd regressor using the input parameters
                            sgd_regressor = linear_model.SGDRegressor(loss=loss, penalty=penalty,
                                                                      eta0=initial_learning_rate,
                                                                      learning_rate=learning_rate)

                            # Run mini-batch training on every fold except the testing fold
                            for fold_number in range(RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS):
                                if fold_number == testing_fold_number:
                                    continue

                                training_start_row = fold_number * \
                                    len(self.training_order_start_end_districts_and_time) // \
                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                                training_end_row = (fold_number + 1) * \
                                    len(self.training_order_start_end_districts_and_time) // \
                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                logging.info("RunRegression: " +
                                             str(RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS) +
                                             " fold cross validation training SGD Regressor for fold " +
                                             str(fold_number) + ", starting row " + str(training_start_row) +
                                             ", ending row " + str(training_end_row) + ", loss " + loss +
                                             ", penalty " + penalty + ", initial learning rate " +
                                             str(initial_learning_rate) + " and learning rate " + learning_rate)

                                # Train regression model
                                sgd_regressor.partial_fit(
                                    X=self.training_order_start_end_districts_and_time[training_start_row:training_end_row],
                                    y=self.training_number_of_orders[training_start_row:training_end_row])

                            testing_start_row = testing_fold_number * \
                                len(self.testing_order_start_end_districts_and_time) // \
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                            testing_end_row = (testing_fold_number + 1) * \
                                len(self.testing_order_start_end_districts_and_time) // \
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            predicted_number_of_orders = sgd_regressor.predict(
                                self.testing_order_start_end_districts_and_time[testing_start_row:testing_end_row])
                            current_ride_prediction_error = numpy.mean(
                                (predicted_number_of_orders -
                                 self.testing_number_of_orders[testing_start_row:testing_end_row]) ** 2)

                            logging.info("RunRegression: Prediction error for fold " +
                                         str(testing_fold_number) + " is " +
                                         str(current_ride_prediction_error))
                            mean_ride_prediction_error += current_ride_prediction_error

                            # Abandon this parameter combination early if it
                            # cannot beat the best error seen so far.
                            if RunRegression.__is_mean_prediction_error_too_high(
                                    mean_ride_prediction_error, lowest_ride_prediction_error):
                                logging.info("RunRegression: Mean prediction error of " +
                                             str(mean_ride_prediction_error) +
                                             " is too high compared to best so far " +
                                             str(lowest_ride_prediction_error) +
                                             ". Ending current cross validation.")
                                break
                        else:
                            mean_ride_prediction_error /= RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                            logging.info("RunRegression: Mean prediction error is " +
                                         str(mean_ride_prediction_error))

                            # Save values if better than previous best
                            if mean_ride_prediction_error < lowest_ride_prediction_error:
                                logging.info("RunRegression: mean error of " +
                                             str(mean_ride_prediction_error) +
                                             " is the best so far. Saving loss " + loss +
                                             ", penalty " + penalty + ", initial learning rate " +
                                             str(initial_learning_rate) + " and learning rate " +
                                             learning_rate)

                                lowest_ride_prediction_error = mean_ride_prediction_error
                                best_loss = loss
                                best_penalty = penalty
                                best_initial_learning_rate = initial_learning_rate
                                best_learning_rate = learning_rate

        logging.info("RunRegression: Running regression with best values so far: loss " + best_loss +
                     ", penalty " + best_penalty + ", initial learning rate " +
                     str(best_initial_learning_rate) + " and learning rate " + best_learning_rate)

        # Retrain on the full training set with the best hyper-parameters
        sgd_regressor = linear_model.SGDRegressor(loss=best_loss, penalty=best_penalty,
                                                  eta0=best_initial_learning_rate,
                                                  learning_rate=best_learning_rate)
        sgd_regressor.fit(X=self.training_order_start_end_districts_and_time,
                          y=self.training_number_of_orders)
        best_predicted_number_of_orders = sgd_regressor.predict(
            self.testing_order_start_end_districts_and_time)
        coef = sgd_regressor.coef_
        print(coef)
        logging.info("RunRegression: Mean squared prediction error after cross validation is " +
                     str(numpy.mean((best_predicted_number_of_orders -
                                     self.testing_number_of_orders) ** 2)))

    @staticmethod
    def __is_mean_prediction_error_too_high(cumulative_mean_prediction_error,
                                            best_prediction_error_so_far):
        """ Check if mean prediction error is too high to qualify as the best so far """
        return cumulative_mean_prediction_error / RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS > \
            best_prediction_error_so_far

    def run_mds_regression(self):
        """ Run regression based on multidimensional scaling """
        # Create a square (features x features) matrix X^T.X from the
        # training data
        training_data_square_matrix = numpy.dot(
            self.training_order_start_end_districts_and_time.T,
            self.training_order_start_end_districts_and_time)
        logging.info("RunRegression: Square matrix shape " + str(training_data_square_matrix.shape))

        # Get eigenvalues and eigenvectors
        training_data_eigen_values, training_data_eigen_vectors = linalg.eig(
            training_data_square_matrix)
        #print(training_data_eigen_values)
        #print(training_data_eigen_vectors)
        print(self.training_order_start_end_districts_and_time)

        # Sort eigenvalues (and their eigenvectors) in descending order
        sorted_index = training_data_eigen_values.argsort()[::-1]
        sorted_training_data_eigen_values = training_data_eigen_values[sorted_index]
        sorted_training_data_eigen_vectors = training_data_eigen_vectors[:, sorted_index]

        logging.info("RunRegression: Found " + str(len(sorted_training_data_eigen_values)) +
                     " eigen values.")
        logging.info("RunRegression: Eigen vectors have length " +
                     str(len(sorted_training_data_eigen_vectors[0])))

        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            RunRegression.__show_eigen_values_trend(eigen_values=sorted_training_data_eigen_values)

    @staticmethod
    def __show_eigen_values_trend(eigen_values):
        """ Show eigenvalues trend """
        # Plot eigenvalues
        plt.plot(eigen_values)
        plt.ylabel('Eigen Values')
        plt.title('Sorted Eigen Values')
        plt.show()

    def leastAngleRegression(self):
        lar = linear_model.Lars()
        lar.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = lar.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders) ** 2)
        print(current_ride_prediction_error)
        print(lar.coef_)

    def orthogonalMatchingPursuit(self):
        omp = linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=10)
        omp.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = omp.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders) ** 2)
        print(current_ride_prediction_error)
        print(omp.coef_)

    def theilSenRegressor(self):
        tsr = linear_model.TheilSenRegressor()
        tsr.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = tsr.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders) ** 2)
        print(current_ride_prediction_error)
        print(tsr.coef_)

    def polynomial(self):
        poly = PolynomialFeatures(degree=3)
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(self.testing_order_start_end_districts_and_time)
        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders) ** 2)
        print(current_ride_prediction_error)
        print(clf.coef_)

    def svm(self):
        oneClass = svm.OneClassSVM()
        logging.info("svm fit")
        oneClass.fit(self.training_order_start_end_districts_and_time,
                     self.training_number_of_orders)
        logging.info("svm predict")
        predicted_number_of_orders = oneClass.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders) ** 2)
        print(current_ride_prediction_error)
        print(oneClass.coef_)

    def districtReduction(self, keyType, key):
        # Slice the district block out of each row, reduce it with SVD, and
        # re-attach the untouched remainder of the row.
        y = key
        districts = numpy.apply_along_axis(sliceTransform, 1, y, 0, self.DISTRICT_SIZE)
        if keyType == "training":
            districtRed = self.svd.fit_transform(districts, self.training_number_of_orders)
        else:
            districtRed = self.svd.transform(districts)
        nonDistrict = numpy.apply_along_axis(sliceTransform, 1, y,
                                             self.DISTRICT_SIZE, self.dimensions)
        keyWithDist = numpy.append(districtRed, nonDistrict, axis=1)
        return keyWithDist

    def timeReduction(self, keyType, key):
        # Earlier blocks have already been reduced to `components` columns
        # each, hence the offsets below.
        y = key
        time = numpy.apply_along_axis(sliceTransform, 1, y, self.components,
                                      self.TIME_SIZE + self.components)
        if keyType == "training":
            timeRed = self.svd.fit_transform(time, self.training_number_of_orders)
        else:
            timeRed = self.svd.transform(time)
        befTime = numpy.apply_along_axis(sliceTransform, 1, y, 0, self.components)
        aftTime = numpy.apply_along_axis(sliceTransform, 1, y,
                                         self.TIME_SIZE + self.components, self.dimensions)
        keyWithTime = numpy.append(befTime, timeRed, axis=1)
        keyWithTime = numpy.append(keyWithTime, aftTime, axis=1)
        return keyWithTime

    def POIReduction(self, keyType, key):
        y = key
        poi = numpy.apply_along_axis(sliceTransform, 1, y, self.components * 2,
                                     self.POI_SIZE + self.components * 2)
        if keyType == "training":
            poiRed = self.svd.fit_transform(poi, self.training_number_of_orders)
        else:
            poiRed = self.svd.transform(poi)
        befPoi = numpy.apply_along_axis(sliceTransform, 1, y, 0, self.components * 2)
        aftPoi = numpy.apply_along_axis(sliceTransform, 1, y,
                                        self.POI_SIZE + self.components * 2, self.dimensions)
        keyWithPoi = numpy.append(befPoi, poiRed, axis=1)
        keyWithPoi = numpy.append(keyWithPoi, aftPoi, axis=1)
        return keyWithPoi

    def WeatherReduction(self, keyType, key):
        y = key
        weather = numpy.apply_along_axis(sliceTransform, 1, y, self.components * 3,
                                         self.WEATHER_SIZE + self.components * 3)
        if keyType == "training":
            weatherRed = self.svd.fit_transform(weather, self.training_number_of_orders)
        else:
            weatherRed = self.svd.transform(weather)
        befWeather = numpy.apply_along_axis(sliceTransform, 1, y, 0, self.components * 3)
        aftWeather = numpy.apply_along_axis(sliceTransform, 1, y,
                                            self.WEATHER_SIZE + self.components * 3,
                                            self.dimensions)
        keyWithWeather = numpy.append(befWeather, weatherRed, axis=1)
        keyWithWeather = numpy.append(keyWithWeather, aftWeather, axis=1)
        return keyWithWeather

    def TrafficReduction(self, keyType, key):
        y = key
        traffic = numpy.apply_along_axis(sliceTransform, 1, y, self.components * 4,
                                         self.TRAFFIC_SIZE + self.components * 4)
        if keyType == "training":
            trafficRed = self.svd.fit_transform(traffic, self.training_number_of_orders)
            if self.reductCount == 0:
                self.boxPlot(trafficRed)
                self.reductCount = 1
        else:
            trafficRed = self.svd.transform(traffic)
        befTraffic = numpy.apply_along_axis(sliceTransform, 1, y, 0, self.components * 4)
        keyWithTraffic = numpy.append(befTraffic, trafficRed, axis=1)
        return keyWithTraffic

    def wholeReductionTraining(self):
        y = self.training_order_start_end_districts_and_time
        b = self.svd.fit_transform(y, self.training_number_of_orders)
        if self.reductCount < 2:
            self.boxPlot(b)
            self.reductCount += 1
        self.training_order_start_end_districts_and_time = b

    def wholeReductionTesting(self):
        y = self.testing_order_start_end_districts_and_time
        b = self.svd.transform(y)
        self.testing_order_start_end_districts_and_time = b

    def reduction(self):
        self.training_order_start_end_districts_and_time = self.initial
        self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
        self.testing_order_start_end_districts_and_time = self.initialTesting

        logging.info("RunRegression: Reducing Districts")
        self.training_order_start_end_districts_and_time = self.districtReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = self.districtReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        # x/y slices kept for the commented-out scatter plots below
        x = self.testing_order_start_end_districts_and_time[:, 0:1]
        y = self.testing_order_start_end_districts_and_time[:, 1:2]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
        print(self.dimensions)

        logging.info("RunRegression: Reducing Time")
        self.training_order_start_end_districts_and_time = self.timeReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = self.timeReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 2:3]
        y = self.training_order_start_end_districts_and_time[:, 3:4]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
        #plt.scatter(x,y)
        #plt.show()

        logging.info("RunRegression: Reducing POI")
        self.training_order_start_end_districts_and_time = self.POIReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = self.POIReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 4:5]
        y = self.training_order_start_end_districts_and_time[:, 5:6]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
        #plt.scatter(x,y)
        #plt.show()

        logging.info("RunRegression: Reducing Weather")
        self.training_order_start_end_districts_and_time = self.WeatherReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = self.WeatherReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 6:7]
        y = self.training_order_start_end_districts_and_time[:, 7:8]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
        #plt.scatter(x,y)
        #plt.show()

        logging.info("RunRegression: Reducing Traffic")
        self.training_order_start_end_districts_and_time = self.TrafficReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = self.TrafficReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 8:9]
        y = self.training_order_start_end_districts_and_time[:, 9:10]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
        print(self.initial.shape)

    def boxPlot(self, arrayBox):
        a = plt.boxplot(arrayBox)
        plt.show()

        # Collect the row indices of the outlier ("flier") points so they can
        # be dropped from the training data.
        idx = set()
        idxSet = set(numpy.arange(len(self.training_order_start_end_districts_and_time)))
        for d in a['fliers']:
            print(len(d.get_ydata()))
            for point in d.get_ydata():
                pIdx = numpy.where(arrayBox == point)
                for rIdx in pIdx[0]:
                    idx.add(rIdx)
        logging.info("done with loop")
        idxKeep = list(idxSet.difference(idx))
        self.initial = self.initial[[idxKeep], :]
        self.training_number_of_orders = self.training_number_of_orders[[idxKeep]]
        self.initial = self.initial.reshape(self.initial.shape[1:])
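# run_sgd_regression above hand-rolls a grid search with nested loops and
# manual fold slicing. A hedged sketch of the equivalent search using
# scikit-learn's GridSearchCV; the data is synthetic and the parameter grid
# is illustrative, not the original project's.
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
import numpy

X = numpy.random.rand(500, 10)
y = numpy.random.rand(500)

param_grid = {
    "penalty": ["l2", "l1", "elasticnet"],
    "eta0": [0.1, 0.01, 0.001],
    "learning_rate": ["constant", "optimal", "invscaling"],
}
search = GridSearchCV(linear_model.SGDRegressor(), param_grid,
                      scoring="neg_mean_squared_error", cv=5)
search.fit(X, y)
print(search.best_params_, -search.best_score_)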
# d3m framework imports (metadata_base, base_utils, d3m_dataframe, ndarray,
# CallResult, DockerContainer, UnsupervisedLearnerPrimitiveBase,
# PrimitiveNotFittedError), sklearn's TruncatedSVD, scipy.sparse, and the
# module-level Params, Hyperparams, and PrimitiveCount definitions are assumed
# to appear earlier in the original tods source file.


class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn TruncatedSVD
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html>`_

    Parameters
    ----------
    n_components: int
        Desired dimensionality of output data. Must be strictly less than the number of features.
        The default value is useful for visualisation. For LSA, a value of 100 is recommended.
    algorithm: hyperparams.Choice
        SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds),
        or "randomized" for the randomized algorithm due to Halko (2009).
    use_columns: Set
        A set of column indices to force primitive to operate on. If any specified column cannot be
        parsed, it is skipped.
    exclude_columns: Set
        A set of column indices to not operate on. Applicable only if "use_columns" is not provided.
    return_result: Enumeration
        Should parsed columns be appended, should they replace original columns, or should only
        parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.
    use_semantic_types: Bool
        Controls whether semantic_types metadata will be used for filtering columns in input
        dataframe. Setting this to false makes the code ignore return_result and will produce only
        the output dataframe.
    add_index_columns: Bool
        Also include primary index columns if input data has them. Applicable only if
        "return_result" is set to "new".
    error_on_no_input: Bool
        Throw an exception if no input column is selected/provided. Defaults to true to behave like
        sklearn. To prevent pipelines from breaking, set this to False.
    return_semantic_type: Enumeration[str]
        Decides what semantic type to attach to generated attributes.
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name": "Truncated SVD",
        "python_path": "d3m.primitives.tods.feature_analysis.truncated_svd",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:[email protected]',
            'uris': ['https://gitlab.com/lhenry15/tods.git',
                     'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/SKTruncatedSVD.py'],
        },
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SINGULAR_VALUE_DECOMPOSITION, ],
        "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
        "id": "9231fde3-7322-3c41-b4cf-d00a93558c44",
        "hyperparams_to_tune": ['n_components', 'algorithm', 'use_columns', 'exclude_columns',
                                'return_result', 'use_semantic_types', 'add_index_columns',
                                'error_on_no_input', 'return_semantic_type'],
        "version": "0.0.1",
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = TruncatedSVD(
            n_components=self.hyperparams['n_components'],
            algorithm=self.hyperparams['algorithm']['choice'],
            n_iter=self.hyperparams['algorithm'].get('n_iter', 5),
            tol=self.hyperparams['algorithm'].get('tol', 0),
            random_state=self.random_seed,
        )
        self.primitiveNo = PrimitiveCount.primitive_no
        PrimitiveCount.primitive_no += 1

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        """
        Set training data for SKTruncatedSVD.
        Args:
            inputs: Container DataFrame
        Returns:
            None
        """
        # self.logger.warning('set was called!')
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit model with training data.
        Args:
            *: Container DataFrame. Time series data to fit on.
        Returns:
            None
        """
        if self._fitted:
            return CallResult(None)

        # Get columns to fit.
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        # If there are no columns to fit, return None
        if self._training_inputs is None:
            return CallResult(None)

        # Call SVD in sklearn and set _fitted to true
        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame.
        Returns:
            Container DataFrame after Truncated SVD.
        """
        # self.logger.warning(str(self.metadata.query()['name']))
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.transform(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs,
                                             column_indices=self._training_indices,
                                             columns_list=output_columns)
        # self._write(outputs)
        # self.logger.warning('produce was called!')
        return CallResult(outputs)

    def get_params(self) -> Params:
        """
        Return parameters.
        Args:
            None
        Returns:
            class Params
        """
        if not self._fitted:
            return Params(
                components_=None,
                explained_variance_ratio_=None,
                explained_variance_=None,
                singular_values_=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            components_=getattr(self._clf, 'components_', None),
            explained_variance_ratio_=getattr(self._clf, 'explained_variance_ratio_', None),
            explained_variance_=getattr(self._clf, 'explained_variance_', None),
            singular_values_=getattr(self._clf, 'singular_values_', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        """
        Set parameters for SKTruncatedSVD.
        Args:
            params: class Params
        Returns:
            None
        """
        self._clf.components_ = params['components_']
        self._clf.explained_variance_ratio_ = params['explained_variance_ratio_']
        self._clf.explained_variance_ = params['explained_variance_']
        self._clf.singular_values_ = params['singular_values_']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']

        # The primitive counts as fitted if any learned attribute was restored.
        if params['components_'] is not None:
            self._fitted = True
        if params['explained_variance_ratio_'] is not None:
            self._fitted = True
        if params['explained_variance_'] is not None:
            self._fitted = True
        if params['singular_values_'] is not None:
            self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams
        Returns:
            list
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int, hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int
        Returns:
            bool
        """
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Make sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata,
                                     hyperparams) -> List[OrderedDict]:
        """
        Output metadata of selected columns.
        Args:
            outputs_metadata: metadata_base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams
        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)
        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                                     outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list
        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)
        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata,
                                                                    self.hyperparams,
                                                                    self.primitiveNo)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs,
                                                             target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata,
                                     hyperparams, primitiveNo):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams
        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)
        return target_columns_metadata

    def _write(self, inputs: Inputs):
        """ Write inputs to the current directory; only for testing. """
        inputs.to_csv(str(time.time()) + '.csv')
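# A sketch of what set_params above relies on: scikit-learn treats an
# estimator as fitted once its trailing-underscore attributes exist, so
# restoring components_ (and friends) onto a fresh TruncatedSVD is enough to
# call transform without refitting. The data here is synthetic.
import numpy
from sklearn.decomposition import TruncatedSVD

X = numpy.random.rand(100, 20)
fitted = TruncatedSVD(n_components=5).fit(X)

restored = TruncatedSVD(n_components=5)
restored.components_ = fitted.components_
restored.explained_variance_ = fitted.explained_variance_
restored.explained_variance_ratio_ = fitted.explained_variance_ratio_
restored.singular_values_ = fitted.singular_values_

assert numpy.allclose(restored.transform(X), fitted.transform(X))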