Example #1
# Imports this excerpt relies on (assumed; the original imports are not shown):
from numpy import reshape, vstack
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier


class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):       
        X = self.decomposer.fit_transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples] ) )
        Y = [x.Y for x in trainExamples]

        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] ) )
        return self.model.predict( X )
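A minimal usage sketch (not part of the original example). It assumes each example object carries a 2-D pixel array X, a label Y, and class-level WIDTH/HEIGHT constants, which is what fit() and predict() above rely on; the random data below is purely illustrative.

import numpy as np

class Example:
    WIDTH, HEIGHT = 28, 28
    def __init__(self, pixels, label):
        self.X, self.Y = pixels, label

rng = np.random.default_rng(0)
train = [Example(rng.random((28, 28)), int(rng.integers(0, 10))) for _ in range(200)]
test = [Example(rng.random((28, 28)), int(rng.integers(0, 10))) for _ in range(20)]

model = RawModel().fit(train)
print(model.predict(test))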
Example #2
# Op below is assumed to be an alias for sklearn.decomposition.TruncatedSVD,
# which is consistent with the hyperparameter names; the original imports are not shown.
from sklearn.decomposition import TruncatedSVD as Op


class TruncatedSVDImpl:
    def __init__(self,
                 n_components=2,
                 algorithm='randomized',
                 n_iter=5,
                 random_state=None,
                 tol=0.0):
        self._hyperparams = {
            'n_components': n_components,
            'algorithm': algorithm,
            'n_iter': n_iter,
            'random_state': random_state,
            'tol': tol
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
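A minimal usage sketch of the wrapper, under the same assumption that Op is sklearn's TruncatedSVD; the array shape and n_components below are illustrative only.

import numpy as np

X = np.random.RandomState(0).rand(100, 30)
svd = TruncatedSVDImpl(n_components=5, random_state=0)
X_reduced = svd.fit(X).transform(X)
print(X_reduced.shape)  # (100, 5)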
Example #3
def reduce_dimensionality(dataframe, maxvariance, columns_to_drop):
    '''
    Perform PCA on a pandas feature dataframe and keep only the leading
    principal components needed to explain the requested fraction of the
    variance; fall back to TruncatedSVD if PCA fails.
    '''
    dataframe_without_columns = dataframe.drop(columns_to_drop, axis=1)
    LOGGER.info('Columns to be used by pca:')
    print(dataframe_without_columns.columns)
    LOGGER.info('Adding noise to dataframe')
    dataframe_without_columns = dataframe_without_columns + numpy.random.normal(
        size=dataframe_without_columns.shape) * 1.e-19
    LOGGER.info('Starting PCA')
    try:
        pca = PCA(n_components='mle')
        pca.fit(dataframe_without_columns)
        # transform
        samples = pca.transform(dataframe_without_columns)
        # aggregated sum of variances
        sum_variance = sum(pca.explained_variance_)
        list_variance = pca.explained_variance_
        #print sum_variance, pca.explained_variance_
        # get those having aggregated variance below threshold
    except ValueError:
        LOGGER.info('PCA failed, using truncated SVD')
        svd = TruncatedSVD(n_components=3)
        svd.fit(dataframe_without_columns)
        samples = svd.transform(dataframe_without_columns)
        sum_variance = sum(svd.explained_variance_)
        list_variance = svd.explained_variance_

    scomp = 0
    ncomp = 0
    while scomp < maxvariance:
        #c = pca.explained_variance_[ncomp]
        c = list_variance[ncomp]
        scomp = scomp + c / sum_variance
        ncomp = ncomp + 1
    # reduce dimensionality
    samples = samples[:, :ncomp]
    LOGGER.info("Number of features after PCA transformation %s" %
                samples.shape[1])
    return samples
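A minimal usage sketch (not from the original code). The function above expects a module-level LOGGER and the numpy, PCA and TruncatedSVD names to be available; the toy dataframe and the 0.9 variance threshold are illustrative only.

import logging
import numpy
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

df = pd.DataFrame(numpy.random.RandomState(0).rand(200, 10),
                  columns=['f%d' % i for i in range(10)])
df['id'] = range(200)

# Keep the components that explain 90% of the variance; drop the id column first.
reduced = reduce_dimensionality(df, maxvariance=0.9, columns_to_drop=['id'])
print(reduced.shape)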
Example #4
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


print(X_train.shape)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)
#u,o,X_train = fastica(X_train.toarray(),n_comp=1000)
print(X_train)
print(X_train.shape)
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."



def final_accuracy(predicted):
    count_same = 0
    total = 0
    for i in range(len(predicted)):
        tags_needed = question_tags[target_files[i].split("/")[-1]]
        count_same += len(list(set(predicted[i]) & set(tags_needed)))
Example #5
class RunRegression(object):

    REGRESSION_TRAINING_INPUT_FILE_NAME = "RegressionTrainingInput.npz"
    REGRESSION_TESTING_INPUT_FILE_NAME = "RegressionTestingInput.npz"
    MAXIMUM_NUMBER_OF_JOBS = -1
    NUMBER_OF_CROSS_VALIDATION_FOLDS = 5
    ROWS_TO_USE_FOR_GAUSSIAN_KERNEL_REGRESSION = 15
    DISTRICT_SIZE = 132
    TIME_SIZE = 152
    POI_SIZE = 352
    WEATHER_SIZE = 9
    TRAFFIC_SIZE = 8

    def __init__(self):

        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0
        for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME,
             FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME,
             FileIo.TEST_DATA_SET)
        ]:

            # Check and see if the data has already been saved
            try:

                logging.info("RunRegression: Trying to load " + data_set +
                             " data")

                saved_data = numpy.load(file_name, mmap_mode='r')

            # If the data is not found, load it
            except IOError:

                logging.info(
                    "RunRegression: Saved data not found. Generating " +
                    data_set + " data")

                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                    poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(
                    data_set, order_categorical_lookup, poi_district_lookup)

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        training_order_start_end_districts_and_time,
                        order_value_price=self.training_order_median_price,
                        order_value_number=self.training_number_of_orders)

                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders  = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        testing_order_start_end_districts_and_time,
                        order_value_price=self.testing_order_median_price,
                        order_value_number=self.testing_number_of_orders)

            # If the saved data is found, load it
            else:

                logging.info("RunRegression: Loading " + data_set + " data")

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                                                         saved_data['order_value_price'], \
                                                         saved_data['order_value_number']

                    self.dimensions = self.training_order_start_end_districts_and_time.shape[
                        1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                                                        saved_data['order_value_price'], \
                                                        saved_data['order_value_number']

                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.testing_number_of_orders)) +
                                 " test data rows")

    """
    Run sgd regression
    """

    def run_sgd_regression(self):

        losses = ["squared_loss"]
        penalties = ["none", "l2", "l1", "elasticnet"]
        initial_learning_rates = [0.1, 0.01, 0.001]
        learning_rates = ["constant", "optimal", "invscaling"]

        lowest_ride_prediction_error = float('inf')

        best_loss = ""
        best_penalty = ""
        best_initial_learning_rate = 0.0
        best_learning_rate = ""

        # Find the best hyper-parameters
        for loss in losses:
            for penalty in penalties:
                for initial_learning_rate in initial_learning_rates:
                    for learning_rate in learning_rates:

                        mean_ride_prediction_error = 0.0

                        # Do k-fold cross-validation using mini-batch training.
                        for testing_fold_number in range(
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                        ):

                            # Create the sgd regressor using the input parameters
                            sgd_regressor = linear_model.SGDRegressor(
                                loss=loss,
                                penalty=penalty,
                                eta0=initial_learning_rate,
                                learning_rate=learning_rate)

                            # Run mini-batch training on every fold except the testing fold
                            for fold_number in range(
                                    RunRegression.
                                    NUMBER_OF_CROSS_VALIDATION_FOLDS):

                                if fold_number == testing_fold_number:
                                    continue

                                training_start_row = fold_number * \
                                                     len(self.training_order_start_end_districts_and_time) // \
                                                     RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                training_end_row = (fold_number + 1) * \
                                                   len(self.training_order_start_end_districts_and_time) // \
                                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                logging.info(
                                    "RunRegression: " +
                                    str(RunRegression.
                                        NUMBER_OF_CROSS_VALIDATION_FOLDS) +
                                    " fold cross validation training SGD Regressor for fold "
                                    + str(fold_number) + ", starting row " +
                                    str(training_start_row) + ", ending row " +
                                    str(training_end_row) + ", loss " + loss +
                                    ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)

                                # Train regression model
                                sgd_regressor\
                                   .partial_fit(X=self.training_order_start_end_districts_and_time[training_start_row :
                                                                                                   training_end_row],
                                                y=self.training_number_of_orders[training_start_row:training_end_row])

                            testing_start_row = testing_fold_number * \
                                                len(self.testing_order_start_end_districts_and_time) // \
                                                 RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            testing_end_row = (testing_fold_number + 1) * \
                                                len(self.testing_order_start_end_districts_and_time) // \
                                                 RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            predicted_number_of_orders = sgd_regressor\
                                .predict(self.testing_order_start_end_districts_and_time[testing_start_row :
                                                                                         testing_end_row])

                            current_ride_prediction_error = numpy.mean(
                                (predicted_number_of_orders -
                                 self.testing_number_of_orders[
                                     testing_start_row:testing_end_row])**2)

                            logging.info(
                                "RunRegression: Prediction error for fold " +
                                str(testing_fold_number) + " is " +
                                str(current_ride_prediction_error))

                            mean_ride_prediction_error += current_ride_prediction_error

                            if RunRegression.__is_mean_prediction_error_too_high(
                                    mean_ride_prediction_error,
                                    lowest_ride_prediction_error):
                                logging.info(
                                    "RunRegression: Mean prediction error of "
                                    + str(mean_ride_prediction_error) +
                                    " is too high compared to the best so far " +
                                    str(lowest_ride_prediction_error) +
                                    ". Ending current cross validation.")
                                break

                        else:

                            mean_ride_prediction_error /= RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            logging.info(
                                "RunRegression: Mean prediction error is " +
                                str(mean_ride_prediction_error))

                            # Save values if better than previous best
                            if mean_ride_prediction_error < lowest_ride_prediction_error:

                                logging.info(
                                    "RunRegression: mean error of " +
                                    str(mean_ride_prediction_error) +
                                    " is the best so far. Saving loss " +
                                    loss + ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)

                                lowest_ride_prediction_error = mean_ride_prediction_error
                                best_loss = loss
                                best_penalty = penalty
                                best_initial_learning_rate = initial_learning_rate
                                best_learning_rate = learning_rate

        logging.info(
            "RunRegression: Running regression with best values so far: loss "
            + best_loss + ", penalty " + best_penalty +
            ", initial learning rate " + str(best_initial_learning_rate) +
            " and learning rate " + best_learning_rate)

        sgd_regressor = linear_model.SGDRegressor(
            loss=best_loss,
            penalty=best_penalty,
            eta0=best_initial_learning_rate,
            learning_rate=best_learning_rate)

        sgd_regressor.fit(X=self.training_order_start_end_districts_and_time,
                          y=self.training_number_of_orders)
        best_predicted_number_of_orders = sgd_regressor.predict(
            self.testing_order_start_end_districts_and_time)

        coef = sgd_regressor.coef_
        print(coef)

        logging.info(
            "RunRegression: Mean squared prediction error after cross validation is "
            + str(
                numpy.mean((best_predicted_number_of_orders -
                            self.testing_number_of_orders)**2)))

    """
    Check if the mean prediction error is too high to qualify as the best so far
    """

    @staticmethod
    def __is_mean_prediction_error_too_high(cumulative_mean_prediction_error,
                                            best_prediction_error_so_far):

        return cumulative_mean_prediction_error / RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS > \
               best_prediction_error_so_far

    """
    Run regression based on multidimensional scaling
    """

    def run_mds_regression(self):

        # Create a square matrix with number of test data rows preserved
        training_data_square_matrix = numpy.dot(
            self.training_order_start_end_districts_and_time.T,
            self.training_order_start_end_districts_and_time)

        logging.info("RunRegression: Square matrix shape " +
                     str(training_data_square_matrix.shape))

        # Get Eigen values and eigen vectors
        training_data_eigen_values, training_data_eigen_vectors = linalg.eig(
            training_data_square_matrix)
        #print(training_data_eigen_values)
        #print(training_data_eigen_vectors)
        print(self.training_order_start_end_districts_and_time)
        sorted_index = training_data_eigen_values.argsort()[::-1]
        sorted_training_data_eigen_values = training_data_eigen_values[
            sorted_index]
        sorted_training_data_eigen_vectors = training_data_eigen_vectors[:,
                                                                         sorted_index]

        logging.info("RunRegression: Found " +
                     str(len(sorted_training_data_eigen_values)) +
                     " eigen values.")
        logging.info("RunRegression: Eigen vectors have length " +
                     str(len(sorted_training_data_eigen_vectors[0])))

        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            RunRegression.__show_eigen_values_trend(
                eigen_values=sorted_training_data_eigen_values)

    """
    Show Eigen values trend
    """

    @staticmethod
    def __show_eigen_values_trend(eigen_values):

        # Plot eigen values
        plt.plot(eigen_values)
        plt.ylabel('Eigen Values')
        plt.title('Sorted Eigen Values')
        plt.show()

    def leastAngleRegression(self):
        lar = linear_model.Lars()
        lar.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = lar.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(lar.coef_)

    def orthogonalMatchingPursuit(self):
        omp = linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=10)
        omp.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = omp.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(omp.coef_)

    def theilSenRegressor(self):
        tsr = linear_model.TheilSenRegressor()
        tsr.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = tsr.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(tsr.coef_)

    def polynomial(self):
        poly = PolynomialFeatures(degree=3)
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(
            self.testing_order_start_end_districts_and_time)

        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(clf.coef_)

    def svm(self):
        oneClass = svm.OneClassSVM()
        logging.info("svm fit")
        oneClass.fit(self.training_order_start_end_districts_and_time,
                     self.training_number_of_orders)
        logging.info("svm predict")
        predicted_number_of_orders = oneClass.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
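        # Note: OneClassSVM exposes coef_ only for kernel='linear'; with the
        # default RBF kernel the attribute below does not exist.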
        print(oneClass.coef_)

    def districtReduction(self, keyType, key):
        y = key
        districts = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                           self.DISTRICT_SIZE)
        if keyType == "training":
            districtRed = self.svd.fit_transform(
                districts, self.training_number_of_orders)
        else:
            districtRed = self.svd.transform(districts)
        nonDistrict = numpy.apply_along_axis(sliceTransform, 1, y,
                                             self.DISTRICT_SIZE,
                                             self.dimensions)
        keyWithDist = numpy.append(districtRed, nonDistrict, axis=1)
        return keyWithDist

    def timeReduction(self, keyType, key):
        y = key
        time = numpy.apply_along_axis(sliceTransform, 1, y, self.components,
                                      self.TIME_SIZE + self.components)
        if keyType == "training":
            timeRed = self.svd.fit_transform(time,
                                             self.training_number_of_orders)
        else:
            timeRed = self.svd.transform(time)
        befTime = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                         self.components)
        aftTime = numpy.apply_along_axis(sliceTransform, 1, y,
                                         self.TIME_SIZE + self.components,
                                         self.dimensions)
        keyWithTime = numpy.append(befTime, timeRed, axis=1)
        keyWithTime = numpy.append(keyWithTime, aftTime, axis=1)
        return keyWithTime

    def POIReduction(self, keyType, key):
        y = key
        poi = numpy.apply_along_axis(sliceTransform, 1, y, self.components * 2,
                                     self.POI_SIZE + self.components * 2)
        if keyType == "training":
            poiRed = self.svd.fit_transform(poi,
                                            self.training_number_of_orders)
        else:
            poiRed = self.svd.transform(poi)
        befPoi = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                        self.components * 2)
        aftPoi = numpy.apply_along_axis(sliceTransform, 1, y,
                                        self.POI_SIZE + self.components * 2,
                                        self.dimensions)
        keyWithPoi = numpy.append(befPoi, poiRed, axis=1)
        keyWithPoi = numpy.append(keyWithPoi, aftPoi, axis=1)
        return keyWithPoi

    def WeatherReduction(self, keyType, key):
        y = key
        weather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 3,
            self.WEATHER_SIZE + self.components * 3)
        if keyType == "training":
            weatherRed = self.svd.fit_transform(weather,
                                                self.training_number_of_orders)
        else:
            weatherRed = self.svd.transform(weather)
        befWeather = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 3)
        aftWeather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.WEATHER_SIZE + self.components * 3,
            self.dimensions)
        keyWithWeather = numpy.append(befWeather, weatherRed, axis=1)
        keyWithWeather = numpy.append(keyWithWeather, aftWeather, axis=1)
        return keyWithWeather

    def TrafficReduction(self, keyType, key):
        y = key
        traffic = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 4,
            self.TRAFFIC_SIZE + self.components * 4)
        if keyType == "training":
            trafficRed = self.svd.fit_transform(traffic,
                                                self.training_number_of_orders)
            if self.reductCount == 0:
                self.boxPlot(trafficRed)
                self.reductCount = 1
        else:
            trafficRed = self.svd.transform(traffic)
        befTraffic = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 4)
        keyWithTraffic = numpy.append(befTraffic, trafficRed, axis=1)
        return keyWithTraffic

    def wholeReductionTraining(self):
        y = self.training_order_start_end_districts_and_time
        b = self.svd.fit_transform(y, self.training_number_of_orders)
        if self.reductCount < 2:
            self.boxPlot(b)
        self.reductCount += 1
        self.training_order_start_end_districts_and_time = b

    def wholeReductionTesting(self):
        y = self.testing_order_start_end_districts_and_time
        b = self.svd.transform(y)
        self.testing_order_start_end_districts_and_time = b

    def reduction(self):
        self.training_order_start_end_districts_and_time = self.initial
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        self.testing_order_start_end_districts_and_time = self.initialTesting

        logging.info("RunRegression: Reducing Districts")
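        # Note: the reduction helpers below are invoked through a module-level
        # run_regression instance rather than self; the fitted TruncatedSVD
        # state therefore lives on that instance.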
        self.training_order_start_end_districts_and_time = run_regression.districtReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.districtReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.testing_order_start_end_districts_and_time[:, 0:1]
        y = self.testing_order_start_end_districts_and_time[:, 1:2]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.dimensions)

        logging.info("RunRegression: Reducing Time")
        self.training_order_start_end_districts_and_time = run_regression.timeReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.timeReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 2:3]
        y = self.training_order_start_end_districts_and_time[:, 3:4]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing POI")
        self.training_order_start_end_districts_and_time = run_regression.POIReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.POIReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 4:5]
        y = self.training_order_start_end_districts_and_time[:, 5:6]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Weather")
        self.training_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 6:7]
        y = self.training_order_start_end_districts_and_time[:, 7:8]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Traffic")
        self.training_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 8:9]
        y = self.training_order_start_end_districts_and_time[:, 9:10]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.initial.shape)

    def boxPlot(self, arrayBox):
        a = plt.boxplot(arrayBox)
        plt.show()
        idx = set()
        idxSet = set(
            numpy.arange(len(
                self.training_order_start_end_districts_and_time)))
        for d in a['fliers']:
            print(len(d.get_ydata()))
            for point in d.get_ydata():
                pIdx = numpy.where(arrayBox == point)
                for rIdx in pIdx[0]:
                    idx.add(rIdx)
        logging.info("done with loop")
        idxKeep = list(idxSet.difference(idx))
        self.initial = self.initial[[idxKeep], :]
        self.training_number_of_orders = self.training_number_of_orders[[
            idxKeep
        ]]
        self.initial = self.initial.reshape(self.initial.shape[1:])
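The class above depends on project-specific modules (FileIo, RegressionInput, PoiDistrictLookup), so it cannot run in isolation. The following self-contained sketch uses synthetic data to illustrate the core pattern of run_sgd_regression: a small hyperparameter grid searched with k-fold cross-validation, where each training fold is fed incrementally to SGDRegressor via partial_fit. The grid values are illustrative, not the full grid used above.

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
X = rng.rand(1000, 20)
y = X @ rng.rand(20) + 0.1 * rng.randn(1000)
folds = 5
fold_size = len(X) // folds

best_error, best_params = float("inf"), None
for penalty in ["l2", "l1", "elasticnet"]:
    for eta0 in [0.1, 0.01, 0.001]:
        errors = []
        for test_fold in range(folds):
            reg = linear_model.SGDRegressor(penalty=penalty, eta0=eta0,
                                            learning_rate="invscaling",
                                            random_state=0)
            # Mini-batch training on every fold except the held-out one
            for fold in range(folds):
                if fold == test_fold:
                    continue
                s, e = fold * fold_size, (fold + 1) * fold_size
                reg.partial_fit(X[s:e], y[s:e])
            s, e = test_fold * fold_size, (test_fold + 1) * fold_size
            errors.append(np.mean((reg.predict(X[s:e]) - y[s:e]) ** 2))
        if np.mean(errors) < best_error:
            best_error, best_params = np.mean(errors), (penalty, eta0)

print(best_params, best_error)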
Example #6
class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn TruncatedSVD
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html>`_   

    Parameters
    ----------
    n_components: int
        Desired dimensionality of output data. Must be strictly less than the number of features. The default value is useful for visualisation. For LSA, a value of 100 is recommended.

    algorithm: hyperparams.Choice
       SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or "randomized" for the randomized algorithm due to Halko (2009).
    
    use_columns: Set
        A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.
    
    exclude_columns: Set
        A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.
    
    return_result: Enumeration
        Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.
    
    use_semantic_types: Bool
        Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.
    
    add_index_columns: Bool
        Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".
    
    error_on_no_input: Bool
        Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.

    return_semantic_type: Enumeration[str]
        Decides what semantic type to attach to generated attributes.
    """    

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
         "name": "Truncated SVD",
         "python_path": "d3m.primitives.tods.feature_analysis.truncated_svd",
         "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:[email protected]', 
         'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/SKTruncatedSVD.py']},
         "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SINGULAR_VALUE_DECOMPOSITION, ],
         "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
         "id": "9231fde3-7322-3c41-b4cf-d00a93558c44",
         "hyperparams_to_tune": ['n_components', 'algorithm', 'use_columns', 'exclude_columns', 'return_result', 'use_semantic_types', 'add_index_columns', 'error_on_no_input', 'return_semantic_type'],
         "version": "0.0.1",
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        
        # False
        self._clf = TruncatedSVD(
              n_components=self.hyperparams['n_components'],
              algorithm=self.hyperparams['algorithm']['choice'],
              n_iter=self.hyperparams['algorithm'].get('n_iter', 5),
              tol=self.hyperparams['algorithm'].get('tol', 0),
              random_state=self.random_seed,
        )

        self.primitiveNo = PrimitiveCount.primitive_no
        PrimitiveCount.primitive_no += 1

        
        
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
        
        
    def set_training_data(self, *, inputs: Inputs) -> None:
        """
        Set training data for SKTruncatedSVD.
        Args:
            inputs: Container DataFrame

        Returns:
            None
        """
        # self.logger.warning('set was called!')
        self._inputs = inputs
        self._fitted = False
        
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit the model with the training data previously supplied via set_training_data().
        Args:
            timeout: float
            iterations: int

        Returns:
            CallResult[None]
        """
        if self._fitted:
            return CallResult(None)

        # Get cols to fit.
        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        # If there are no columns to fit, return None
        if self._training_inputs is None:
            return CallResult(None)

        # Call SVD in sklearn and set _fitted to true
        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        return CallResult(None)
        
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame.

        Returns:
            Container DataFrame after Truncated SVD.
        """
        # self.logger.warning(str(self.metadata.query()['name']))


        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.transform(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            
            outputs = self._wrap_predictions(inputs, sk_output)
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                               add_index_columns=self.hyperparams['add_index_columns'],
                                               inputs=inputs, column_indices=self._training_indices,
                                               columns_list=output_columns)

        # self._write(outputs)
        # self.logger.warning('produce was called!')
        return CallResult(outputs)
        

    def get_params(self) -> Params:
        """
        Return parameters.
        Args:
            None

        Returns:
            class Params
        """
        if not self._fitted:
            return Params(
                components_=None,
                explained_variance_ratio_=None,
                explained_variance_=None,
                singular_values_=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            components_=getattr(self._clf, 'components_', None),
            explained_variance_ratio_=getattr(self._clf, 'explained_variance_ratio_', None),
            explained_variance_=getattr(self._clf, 'explained_variance_', None),
            singular_values_=getattr(self._clf, 'singular_values_', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        """
        Set parameters for SKTruncatedSVD.
        Args:
            params: class Params

        Returns:
            None
        """
        self._clf.components_ = params['components_']
        self._clf.explained_variance_ratio_ = params['explained_variance_ratio_']
        self._clf.explained_variance_ = params['explained_variance_']
        self._clf.singular_values_ = params['singular_values_']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        
        if params['components_'] is not None:
            self._fitted = True
        if params['explained_variance_ratio_'] is not None:
            self._fitted = True
        if params['explained_variance_'] is not None:
            self._fitted = True
        if params['singular_values_'] is not None:
            self._fitted = True

   
    
    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            list
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                             use_columns=hyperparams['use_columns'],
                                                                             exclude_columns=hyperparams['exclude_columns'],
                                                                             can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int

        Returns:
            bool
        """
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        
        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False
    

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        """
        Output metadata of selected columns.
        Args:
            outputs_metadata: metadata_base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set()
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
    
    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Update metadata for the selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)

        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
        return outputs


    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _write(self, inputs: Inputs):
        """
        Write inputs to the current directory as a CSV; intended for testing only.
        """
        inputs.to_csv(str(time.time())+'.csv')
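Running SKTruncatedSVD itself requires a d3m container DataFrame and a Hyperparams instance, which are not shown here. The sketch below only illustrates the underlying sklearn transform the primitive wraps, with output column names mimicking the "{name}{primitiveNo}_{index}" pattern produced by _add_target_columns_metadata (assuming primitiveNo is 0); it is not the d3m API.

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

df = pd.DataFrame(np.random.RandomState(0).rand(50, 8),
                  columns=['col%d' % i for i in range(8)])

clf = TruncatedSVD(n_components=3, algorithm='randomized', n_iter=5, random_state=0)
clf.fit(df)                              # what fit() above does with the selected columns
reduced = pd.DataFrame(clf.transform(df),
                       columns=['Truncated SVD0_%d' % i for i in range(3)])
print(reduced.shape)                     # (50, 3)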