Example #1
    def __rolling_window_test(self, data, window_size, test_size, step=1):
        print("\t\tRolling Window Validation Results:")

        # TODO: Hide the STDOUT of pp.split() and __fit_model(), and prevent __fit_model() from saving a .pkl on each run

        # Build overlapping windows covering `window_size` of the data, advancing `step` rows at a time
        windows = [
            data.loc[idx * step:(idx * step) + round(window_size * len(data))]
            for idx in range(
                int((len(data) - round(window_size * len(data))) / step))
        ]
        # TODO: Do a nonrandom split to respect the temporal order of observations
        decoupled_windows = [
            pp.split(window, test_size=test_size, balanced=False)
            for window in windows
        ]

        results = {"accuracy": [], "precision": [], "specificity": [], "sensitivity": []}
        for feature_set in decoupled_windows:
            self.x_train, self.x_test, self.y_train, self.y_test = feature_set

            self.scaler = StandardScaler()
            self.scaler.fit(self.x_train)

            self.__fit_model()

            self.y_pred = self.model.predict(self.scaler.transform(self.x_test))

            results["accuracy"].append(analysis.accuracy(self.y_pred, self.y_test))
            results["precision"].append(analysis.precision(self.y_pred, self.y_test))
            results["specificity"].append(analysis.specificity(self.y_pred, self.y_test))
            results["sensitivity"].append(analysis.sensitivity(self.y_pred, self.y_test))

        print("\t\t\tAccuracy: ", str(sum(results["accuracy"]) / float(len(results["accuracy"]))))
        print("\t\t\tPrecision: ", str(sum(results["precision"]) / float(len(results["precision"]))))
        print("\t\t\tSpecificity: ", str(sum(results["specificity"]) / float(len(results["specificity"]))))
        print("\t\t\tSensitivity: ", str(sum(results["sensitivity"]) / float(len(results["sensitivity"]))))
Example #2
def main(args):
	set_settings(args)
	data,gt = fetch(args)
	train_data,train_gt,test_data,test_gt,train_names,test_names = pp.split(data,gt,
																			args.split_method,
																			train_share=args.settings["train_share"],
																			test_share=args.settings["test_share"],
																			names=args.names,
																			return_names=True)
	#print(train_names)
	print(len(train_data))
	#print(test_names)
	print(len(test_data))

	try: 
		if args.settings["self_test"]:
			test_data = train_data
			test_gt = train_gt
			test_names = train_names
	except KeyError:
		pass
	
	args.train_names = train_names
	args.test_names = test_names
	#print(len(train_data))

	#probe_data,probe_gt = fetch(args,"PROBE",data[0].shape[1])
	#print(len(probe_data))
	#print(probe_data[0])

	print("Training")
	model = train(train_data,train_gt,args.model,args)
	test_model(model,test_data,test_gt,args)
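For context, pp.split is called above with a split method, train/test shares, and return_names=True, and is unpacked into six values. A rough, purely illustrative stand-in with the same return shape (share_split and its random strategy are assumptions; the real pp.split and args.split_method are defined elsewhere in the project):

import numpy as np

def share_split(data, gt, train_share=0.8, test_share=0.2, names=None):
    # Hypothetical stand-in: random split by shares, carrying sample names
    # along with the data and ground truth.
    n = len(data)
    order = np.random.permutation(n)
    n_train = int(round(n * train_share))
    n_test = min(int(round(n * test_share)), n - n_train)
    train_idx, test_idx = order[:n_train], order[n_train:n_train + n_test]

    def pick(seq, idx):
        return [seq[i] for i in idx]

    names = names if names is not None else list(range(n))
    return (pick(data, train_idx), pick(gt, train_idx),
            pick(data, test_idx), pick(gt, test_idx),
            pick(names, train_idx), pick(names, test_idx))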
Example #3
    def preprocess(self, document, info=[]):
        document = to_unicode(document, info)
        words = tokenize(document)

        if self.split:
            words = split(words)

        if self.lower:
            words = (word.lower() for word in words)

        if self.remove_stops:
            words = remove_stops(words, STOPS)

        def include(word):
            return len(word) >= self.min_len and len(word) <= self.max_len
        words = (word for word in words if include(word))
        return words
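The helpers used above (to_unicode, tokenize, split, remove_stops, STOPS) come from the project's preprocessing module. A self-contained sketch of what comparable helpers could look like, for illustration only (the identifier-aware splitting on underscores and camelCase is an assumption about what split does):

import re

STOPS = {"the", "a", "an", "and", "or", "of", "to", "in"}  # toy stop list

def to_unicode(document, info=None):
    # Decode bytes to str if needed; `info` is accepted only for parity.
    return document.decode("utf-8", "replace") if isinstance(document, bytes) else document

def tokenize(document):
    return re.findall(r"[A-Za-z_][A-Za-z0-9_]*", document)

def split(words):
    # Break identifiers on underscores and lower-to-upper camelCase boundaries.
    for word in words:
        for part in re.split(r"_|(?<=[a-z0-9])(?=[A-Z])", word):
            if part:
                yield part

def remove_stops(words, stops):
    return (word for word in words if word.lower() not in stops)

Note that preprocess builds a chain of generators, so nothing is materialized until the caller iterates over the returned words.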
Example #4
    def preprocess(self, document, info=[]):
        document = preprocessing.to_unicode(document, info)
        words = preprocessing.tokenize(document)

        if self.split:
            words = preprocessing.split(words)

        if self.lower:
            words = (word.lower() for word in words)

        if self.remove_stops:
            words = preprocessing.remove_stops(words, preprocessing.FOX_STOPS)
            words = preprocessing.remove_stops(words, preprocessing.JAVA_RESERVED)

        def include(word):
            return len(word) >= self.min_len and len(word) <= self.max_len
        words = (word for word in words if include(word))
        return words
Example #6
    def __rolling_window_test(self, data, window_size, test_size, step=1):
        print("\t\tRolling Window Validation Results:")

        # TODO: Hide the STDOUT of pp.split() and __fit_model(), and prevent __fit_model() from saving a .pkl on each run

        windows = [
            data.loc[idx * step:(idx * step) + round(window_size * len(data))]
            for idx in range(
                int((len(data) - round(window_size * len(data))) / step))
        ]
        decoupled_windows = [
            pp.split(window, test_size=test_size, balanced=False)
            for window in windows
        ]

        results = {"accuracy": [], "precision": [], "f1": [], "recall": []}
        for feature_set in decoupled_windows:
            self.x_train, self.x_test, self.y_train, self.y_test = feature_set

            self.__fit_model()

            self.y_pred = self.model.predict(self.x_test)

            results["accuracy"].append(
                analysis.accuracy(self.y_test, self.y_pred))
            results["precision"].append(
                analysis.precision(self.y_test, self.y_pred,
                                   weighted_avg=True))
            results["recall"].append(
                analysis.recall(self.y_test, self.y_pred, weighted_avg=True))
            results["f1"].append(analysis.f1(self.y_test, self.y_pred))

        print("\t\t\tAccuracy: ",
              str(sum(results["accuracy"]) / float(len(results["accuracy"]))))
        print(
            "\t\t\tPrecision: ",
            str(sum(results["precision"]) / float(len(results["precision"]))))
        print("\t\t\tRecall: ",
              str(sum(results["recall"]) / float(len(results["recall"]))))
        print("\t\t\tF1: ",
              str(sum(results["f1"]) / float(len(results["f1"]))))
Example #7
def main():

    print("Fetching data")

    price_data = scraper.fetch_data(
        os.path.dirname(os.getcwd()) + "/data/price_data.csv")
    blockchain_data = scraper.fetch_data(
        os.path.dirname(os.getcwd()) + "/data/blockchain_data.csv")
    coindesk_headlines = pd.read_csv(os.path.dirname(os.getcwd()) +
                                     "/data/scored_headlines_sentiment.csv",
                                     usecols=["Headline", "Sentiment"],
                                     sep=",")

    # Preprocessing #

    ####
    ## START Sentiment Analysis Block
    ####

    print("Sentiment Analysis")
    coindesk_headlines, stemmed = pp.sentiment_preprocessing(
        coindesk_headlines)

    # Create bag of words model.
    coindesk_headlines = pp.make_bag_of_words(coindesk_headlines, stemmed)

    x_train, x_test, y_train, y_test = pp.headlines_balanced_split(
        coindesk_headlines, test_size=.2)

    print("\nFitting sentiment models...\n")

    rand_forest = SentimentModel(estimator="RandomForest",
                                 train_set=(x_train, y_train),
                                 test_set=(x_test, y_test))

    grad_boost = SentimentModel(estimator="GradientBoosting",
                                train_set=(x_train, y_train),
                                test_set=(x_test, y_test))

    support_vec = SentimentModel(estimator="SupportVectorClassifier",
                                 train_set=(x_train, y_train),
                                 test_set=(x_test, y_test))

    # Evaluation #

    print("\nEvaluating sentiment models...\n")

    conf_matrix_counter = 0

    # Random Forest Classifier
    print("\tRandom Forest Classifier")
    analysis.plot_cnf_matrix(rand_forest.y_pred, rand_forest.y_test)
    rand_forest.cross_validate(method="Holdout")

    # Gradient Boosting Classifier
    print("\tGradient Boosting Classifier")
    analysis.plot_cnf_matrix(grad_boost.y_pred, grad_boost.y_test)
    grad_boost.cross_validate(method="Holdout")

    # Support Vector Classifier
    print("\tSupport Vector Classifier")
    analysis.plot_cnf_matrix(support_vec.y_pred, support_vec.y_test)
    support_vec.cross_validate(method="Holdout")

    ####
    ## END Sentiment Analysis Block
    ####

    print("Preprocessing")

    data = (price_data.pipe(pp.calculate_indicators)
            .pipe(pp.merge_datasets, other_sets=[blockchain_data])  # [blockchain_data, coindesk_headlines]
            .pipe(pp.binarize_labels)
            .pipe(pp.fix_null_vals)
            .pipe(pp.add_lag_variables, lag=3)
            .pipe(pp.power_transform))
    x_train, x_test, y_train, y_test = pp.split(data,
                                                test_size=.2,
                                                balanced=True)

    # Exploratory Analysis #

    print("Analyzing features")

    #print(data.describe())
    analysis.plot_corr_matrix(data)

    # Fitting Models #

    print("Fitting models")

    log_reg = Model(estimator="LogisticRegression",
                    train_set=(x_train, y_train),
                    test_set=(x_test, y_test),
                    select_features="RecursiveFE",
                    optimize=OPTIMIZE)
    rand_forest = Model(estimator="RandomForest",
                        train_set=(x_train, y_train),
                        test_set=(x_test, y_test),
                        select_features="RecursiveFE",
                        optimize=OPTIMIZE)
    grad_boost = Model(estimator="GradientBoosting",
                       train_set=(x_train, y_train),
                       test_set=(x_test, y_test),
                       select_features="RecursiveFE",
                       optimize=OPTIMIZE)

    # Evaluation #

    print("Evaluating")

    # Logistic Regression
    print("\tLogistic Regression Estimator")
    log_reg.plot_cnf_matrix()
    log_reg.cross_validate(method="Holdout")
    log_reg.cross_validate(method="RollingWindow",
                           data=data,
                           window_size=.9,
                           test_size=.1)

    # Random Forest
    print("\tRandom Forest Classifier")
    rand_forest.plot_cnf_matrix()
    rand_forest.cross_validate(method="holdout")
    rand_forest.cross_validate(method="RollingWindow",
                               data=data,
                               window_size=.9,
                               test_size=.1)

    # Gradient Boosting
    print("\tGradient Boosting Classifier")
    grad_boost.plot_cnf_matrix()
    grad_boost.cross_validate(method="holdout")
    grad_boost.cross_validate(method="RollingWindow",
                              data=data,
                              window_size=.9,
                              test_size=.1)
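The preprocessing chain above leans on pandas' DataFrame.pipe, where df.pipe(func, **kwargs) is simply func(df, **kwargs), so long pipelines read top to bottom. A tiny self-contained illustration (the column and helper names here are made up):

import pandas as pd

def add_return(df):
    df = df.copy()
    df["return"] = df["close"].pct_change()
    return df

def drop_nulls(df):
    return df.dropna()

frame = pd.DataFrame({"close": [100.0, 101.5, 99.0, 103.2]})

# Equivalent to drop_nulls(add_return(frame)), but reads in execution order,
# which is what makes chains like price_data.pipe(pp.calculate_indicators)... work.
clean = frame.pipe(add_return).pipe(drop_nulls)
print(clean)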
Example #8
import argparse

import numpy as np
import pandas as pd

from preprocessing import split, build_monthly_trends
from supply_planning import plan_supply

np.random.seed(1)

pd.set_option('display.max_columns', None)

headcount = pd.read_excel("../res/Demandv1.1.xlsx", 'Headcount')

demand_trend = pd.read_excel("../res/Demandv1.1.xlsx",
                             'Demand Trend Last year')

monthly_demand_trends, utils = build_monthly_trends(demand_trend)

billable, bench = split(headcount)
employees = (billable, bench)

revenue_per_billable_r = 900
cost_per_r = 685
total_bench_budget = 5760000
attrition = 0.2
max_r = 12000
notice_period = 2

parser = argparse.ArgumentParser()
parser.add_argument(
    '-br',
    '--billable_revenue',
    help='Revenue that a billable employee brings to the company',
    required=False)
Example #9
def main():
    names = data_handling.read_names(["keynote"])
    samples, targets, zero_length_indices = data_handling.dataset(names)
    names = np.delete(names, zero_length_indices)

    print("Original shapes, samples: {} \t targets : {} \t names: {}".format(
        np.shape(samples), np.shape(targets), np.shape(names)))
    unique, counts = np.unique(targets, return_counts=True)
    print("Class breakdown: {}".format(dict(zip(unique, counts))))

    # Samples Preprocessing
    samples = preprocessing.scale_select(samples)
    print("Feature selection shapes, samples: {} \t targets : {}\n".format(
        np.shape(samples), np.shape(targets)))
    # ###########

    # Dimensionality reduction and classification exploration
    if (False):
        dimensionalities = [100, 150, 170, 200]
        perplexities = [35, 40, 45, 50]
        learning_rates = [500, 800]
        for dimensionality in dimensionalities:
            for perplexity in perplexities:
                for l_r in learning_rates:
                    print(
                        "Dimensionality: {} \t Perplexity: {} \t Learning rate : {}"
                        .format(dimensionality, perplexity, l_r))

                    # Reduction
                    print("Reduction")
                    reduced_samples = tSNE_IO.load_or_reduce(samples)
                    print("Reduced samples shape: {}".format(
                        np.shape(reduced_samples)))
                    # ########

                    # Class balancing
                    data = np.hstack(
                        (reduced_samples, np.reshape(targets,
                                                     (len(targets), 1))))
                    data, names = preprocessing.balance_classes(
                        data, names,
                        np.shape(reduced_samples)[1])
                    print("Balanced data shape: {}".format(np.shape(data)))
                    reduced_samples = data[:, :-1]
                    targets = data[:, -1]
                    unique, counts = np.unique(targets, return_counts=True)
                    print("Class breakdown: {}\n".format(
                        dict(zip(unique, counts))))
                    # ###########

                    # Splitting
                    # Split in train and test
                    train, test = preprocessing.split(reduced_samples, targets,
                                                      0.15)
                    # #########
                    for c in np.arange(.1, .9, .1):
                        # SVM fitting
                        clf = SVC(C=c,
                                  class_weight='balanced',
                                  verbose=0,
                                  probability=True)

                        clf.fit(train[:, :-1], train[:, -1])
                        score = clf.score(test[:, :-1], test[:, -1])

                        print(
                            "SVC score of fit on case study data: {}, with C: {}"
                            .format(score,
                                    clf.get_params()["C"]))

                        parameters = {
                            "algo": "SVC",
                            "kernel": clf.get_params()["kernel"],
                            "dataset": "keynote",
                            "tsne_perplexity": perplexity,
                            "tsne_learning_rate": l_r,
                            "tnse_dimensionality": dimensionality,
                            "svc_c": c,
                            "score": score,
                        }

                        logger.store_log_entry(
                            parameters,
                            "keynote_supervised_exploration_log.json")
            print()

    # Assume dimensionality reduction, classification exploration
    if (True):
        # Reduction
        print("Reduction")
        reduced_samples = tSNE_IO.load_or_reduce(samples, PERPLEXITY,
                                                 LEARNING_RATE, EXAGGERATION)
        print("Reduced samples shape: {}".format(np.shape(reduced_samples)))
        # ########

        # Class balancing
        data = np.hstack(
            (reduced_samples, np.reshape(targets, (len(targets), 1))))
        data, names = preprocessing.balance_classes(
            data, names,
            np.shape(reduced_samples)[1])
        print("Balanced data shape: {}".format(np.shape(data)))
        reduced_samples = data[:, :-1]
        targets = data[:, -1]
        unique, counts = np.unique(targets, return_counts=True)
        print("Class breakdown: {}\n".format(dict(zip(unique, counts))))
        # ###########

        # Split in train and test
        train, test = preprocessing.split(reduced_samples, targets, 0.15)
        # #########

        for c in np.arange(.1, 1, .1):
            # SVM fitting
            clf = SVC(C=c,
                      class_weight='balanced',
                      verbose=0,
                      probability=True)

            clf.fit(train[:, :-1], train[:, -1])
            score = clf.score(test[:, :-1], test[:, -1])

            print("SVC score of fit on case study data: {}, with C: {}".format(
                score,
                clf.get_params()["C"]))

            parameters = {
                "algo": "SVC",
                "kernel": clf.get_params()["kernel"],
                "dataset": "keynote",
                "tsne_perplexity": PERPLEXITY,
                "tsne_learning_rate": LEARNING_RATE,
                "tnse_dimensionality": 2,
                "svc_c": c,
                "score": score,
            }

            logger.store_log_entry(parameters,
                                   "keynote_supervised_exploration_log.json")
Example #10
print('test data SVD finished!\n')
three = time.perf_counter()
print('time of SVD = %s' % (three - two))
#V_address = 'C:/Users/Chaomin/Desktop/data_mining/data/all_V_1000.npy'
#V_1000 = np.load((V_address))

V_address = cwd + '/data/temp/all_V_1000'
np.save(V_address, V_1000)

three = time.perf_counter()

print('time of SVD = %s \n' % (three - two))

four = time.perf_counter()
print('Splitting data into test data and training data:\n')
X_train_ay, X_test_ay, X_train_index, X_test_index = preprocessing.split(
    X_data_matrix, 0.2, 1)
#np.shape(X_train_ay) == (80,13628) #np.shape(X_test_ay) == (20, 13628)
# The first column of X_train_ay and X_test_ay is the app_name, and the last column is the app_label
m_test, n_test = np.shape(X_test_ay)
m_train, n_train = np.shape(X_train_ay)
X_test_label = X_test_ay[:, n_test - 1]
X_train_label = X_train_ay[:, n_train - 1]
X_test_appname = X_test_ay[:, 0]
X_train_appname = X_train_ay[:, 0]
X_raw_test = X_test_ay[:, 1:n_test - 1]
X_raw_train = X_train_ay[:, 1:n_train - 1]

X_raw_test_address = cwd + '/data/temp/X_raw_test'
np.save(X_raw_test_address, X_raw_test)

X_test_label_address = cwd + '/data/temp/X_test_label'
Example #11
price_data = scraper.fetch_data(
    os.path.dirname(os.getcwd()) + "/data/price_data.csv")
blockchain_data = scraper.fetch_data(
    os.path.dirname(os.getcwd()) + "/data/blockchain_data.csv")
#coindesk_headlines = pd.read_csv(os.path.dirname(os.getcwd()) + "/data/test_scores.csv", sep=",")

# Preprocessing #

print("Preprocessing")

data = (price_data.pipe(pp.calculate_indicators)
        .pipe(pp.merge_datasets, other_sets=[blockchain_data])  # [blockchain_data, coindesk_headlines]
        .pipe(pp.binarize_labels)
        .pipe(pp.fix_null_vals)
        .pipe(pp.add_lag_variables, lag=3)
        .pipe(pp.power_transform))
x_train, x_test, y_train, y_test = pp.split(data, test_size=.2, balanced=True)

# Exploratory Analysis #

print("Analyzing features")

#print(data.describe())
analysis.plot_corr_matrix(data)

# Fitting Models #

print("Fitting models")

log_reg = Model(estimator="LogisticRegression",
                train_set=(x_train, y_train),
                test_set=(x_test, y_test),