def test_loading():
    """Load the listed univariate and multivariate problems and print their shapes.

    Univariate problems are read from ``E:/tsc_ts/``. Multivariate problems
    are read from ``E:/mtsc_ts/``, starting at list position 16.
    """

    def _print_shapes(train_data, train_labels, test_data, test_labels):
        # Report the shape of each of the four loaded objects.
        print("Train X shape :")
        print(train_data.shape)
        print("Train Y shape :")
        print(train_labels.shape)
        print("Test X shape :")
        print(test_data.shape)
        print("Test Y shape :")
        print(test_labels.shape)

    # Univariate problems
    for position, problem in enumerate(dataset_lists.univariate):
        stem = "E:/tsc_ts/" + problem + "/" + problem
        train_data, train_labels = load_ts(stem + "_TRAIN.ts")
        test_data, test_labels = load_ts(stem + "_TEST.ts")
        print("Loaded " + problem + " in position " + str(position))
        _print_shapes(train_data, train_labels, test_data, test_labels)

    # Multivariate problems; the first 16 entries of the list are skipped.
    for position in range(16, len(dataset_lists.multivariate)):
        problem = dataset_lists.multivariate[position]
        stem = "E:/mtsc_ts/" + problem + "/" + problem
        print("Loading " + problem + " in position " + str(position) + ".......")
        train_data, train_labels = load_ts(stem + "_TRAIN.ts")
        test_data, test_labels = load_ts(stem + "_TEST.ts")
        print("Loaded " + problem)
        _print_shapes(train_data, train_labels, test_data, test_labels)
def demo_loading():
    """Check dataset loading of the univariate and multivariate problems.

    Every univariate problem is loaded from ``E:/tsc_ts/`` and the
    multivariate problems from list position 16 onwards are loaded from
    ``E:/mtsc_ts/``. The shapes of the loaded train and test data are printed.
    """
    univariate_root = "E:/tsc_ts/"
    for idx, name in enumerate(dataset_lists.univariate):
        prefix = univariate_root + name + "/" + name
        x_tr, y_tr = load_ts(prefix + "_TRAIN.ts")
        x_te, y_te = load_ts(prefix + "_TEST.ts")
        print("Loaded " + name + " in position " + str(idx))
        for label, loaded in (
            ("Train X shape :", x_tr),
            ("Train Y shape :", y_tr),
            ("Test X shape :", x_te),
            ("Test Y shape :", y_te),
        ):
            print(label)
            print(loaded.shape)

    multivariate_root = "E:/mtsc_ts/"
    idx = 16  # loading starts part-way through the multivariate list
    while idx < len(dataset_lists.multivariate):
        name = dataset_lists.multivariate[idx]
        prefix = multivariate_root + name + "/" + name
        print("Loading " + name + " in position " + str(idx) + ".......")
        x_tr, y_tr = load_ts(prefix + "_TRAIN.ts")
        x_te, y_te = load_ts(prefix + "_TEST.ts")
        print("Loaded " + name)
        for label, loaded in (
            ("Train X shape :", x_tr),
            ("Train Y shape :", y_tr),
            ("Test X shape :", x_te),
            ("Test Y shape :", y_te),
        ):
            print(label)
            print(loaded.shape)
        idx += 1
def test_loading():
    """Load the train and test files of a single univariate problem.

    Reads ``Gunpoint_TRAIN.ts`` and ``Gunpoint_TEST.ts`` from
    ``E:/tsc_ts/Gunpoint/`` with ``load_ts`` and prints a confirmation.
    """
    data_dir = "E:/tsc_ts/"
    dataset = "Gunpoint"
    trainX, trainY = load_ts(data_dir + dataset + "/" + dataset + "_TRAIN.ts")
    testX, testY = load_ts(data_dir + dataset + "/" + dataset + "_TEST.ts")
    # Only one dataset is loaded here, so there is no list position to report;
    # the previous message read an undefined loop index and raised NameError.
    print("Loaded " + dataset)
def run_experiment(
    problem_path,
    results_path,
    cls_name,
    dataset,
    classifier=None,
    resampleID=0,
    overwrite=False,
    format=".ts",
    train_file=False,
):
    """
    Method to run a basic experiment and write the results to files called
    testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv.

    :param problem_path: Location of problem files, full path.
    :param results_path: Location of where to write results. Any required
        directories will be created
    :param cls_name: determines which classifier to use, as defined in
        set_classifier. This assumes predict_proba is implemented, to avoid
        predicting twice. May break some classifiers though
    :param dataset: Name of problem. Files must be
        <problem_path>/<dataset>/<dataset>+"_TRAIN"+format, same for "_TEST"
    :param classifier: classifier instance to use. If None, one is created
        with set_classifier(cls_name, resampleID).
    :param resampleID: Seed for resampling. If set to 0, the default
        train/test split from file is used. Also used in output file name.
    :param overwrite: if set to False, this will only build results if there
        is not a result file already present. If True, it will overwrite
        anything already there
    :param format: Valid formats are ".ts", ".arff" and ".long". For more
        info on format, see
        https://github.com/alan-turing-institute/sktime/blob/master/examples/Loading%20Data%20Examples.ipynb
    :param train_file: whether to generate train files or not. If true, it
        performs a 10xCV on the train and saves
    :return: None
    """
    build_test = True
    if not overwrite:
        predictions_dir = (
            str(results_path) + "/" + str(cls_name) + "/Predictions/" + str(dataset)
        )
        full_path = predictions_dir + "/testFold" + str(resampleID) + ".csv"
        if os.path.exists(full_path):
            print(
                full_path
                + " Already exists and overwrite set to false, not building Test"
            )
            build_test = False
        if train_file:
            full_path = predictions_dir + "/trainFold" + str(resampleID) + ".csv"
            if os.path.exists(full_path):
                print(
                    full_path
                    + " Already exists and overwrite set to false, not building Train"
                )
                train_file = False
        if not train_file and not build_test:
            # Every requested result file is already present.
            return

    # TODO: automatically differentiate between problem types; loading
    # currently always goes through load_ts.
    trainX, trainY = load_ts(problem_path + dataset + "/" + dataset + "_TRAIN" + format)
    testX, testY = load_ts(problem_path + dataset + "/" + dataset + "_TEST" + format)
    trainX = _normalise_X(trainX)
    testX = _normalise_X(testX)
    if resampleID != 0:
        trainX, trainY, testX, testY = stratified_resample(
            trainX, trainY, testX, testY, resampleID
        )

    # The encoder is fitted on the train labels only and applied to both splits.
    le = preprocessing.LabelEncoder()
    le.fit(trainY)
    trainY = le.transform(trainY)
    testY = le.transform(testY)

    if classifier is None:
        classifier = set_classifier(cls_name, resampleID)
    print(cls_name + " on " + dataset + " resample number " + str(resampleID))

    if build_test:
        # TODO: use sklearn CV
        start = int(round(time.time() * 1000))
        classifier.fit(trainX, trainY)
        build_time = int(round(time.time() * 1000)) - start
        start = int(round(time.time() * 1000))
        probs = classifier.predict_proba(testX)
        preds = classifier.classes_[np.argmax(probs, axis=1)]
        test_time = int(round(time.time() * 1000)) - start
        ac = accuracy_score(testY, preds)
        print(
            cls_name
            + " on "
            + dataset
            + " resample number "
            + str(resampleID)
            + " test acc: "
            + str(ac)
            + " time: "
            + str(test_time)
        )
        second = _params_as_single_line(cls_name, classifier)
        print(second)
        third = (
            str(ac)
            + ","
            + str(build_time)
            + ","
            + str(test_time)
            + ",-1,-1,"
            + str(len(classifier.classes_))
        )
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=results_path,
            classifier_name=cls_name,
            resample_seed=resampleID,
            predicted_class_vals=preds,
            actual_probas=probs,
            dataset_name=dataset,
            actual_class_vals=testY,
            split="TEST",
        )

    if train_file:
        start = int(round(time.time() * 1000))
        if build_test and hasattr(classifier, "_get_train_probs"):
            # The classifier's own train estimates are only used when it was
            # fitted above as part of building the test results.
            train_probs = classifier._get_train_probs(trainX)
        else:
            train_probs = cross_val_predict(
                classifier, X=trainX, y=trainY, cv=10, method="predict_proba"
            )
        train_time = int(round(time.time() * 1000)) - start
        # NOTE(review): when build_test is False, fit is never called on
        # `classifier` in this function, so classes_ must already be set on
        # the instance that was passed in - confirm against callers.
        train_preds = classifier.classes_[np.argmax(train_probs, axis=1)]
        train_acc = accuracy_score(trainY, train_preds)
        print(
            cls_name
            + " on "
            + dataset
            + " resample number "
            + str(resampleID)
            + " train acc: "
            + str(train_acc)
            + " time: "
            + str(train_time)
        )
        second = _params_as_single_line(cls_name, classifier)
        third = (
            str(train_acc)
            + ","
            + str(train_time)
            + ",-1,-1,-1,"
            + str(len(classifier.classes_))
        )
        write_results_to_uea_format(
            second_line=second,
            third_line=third,
            output_path=results_path,
            classifier_name=cls_name,
            resample_seed=resampleID,
            predicted_class_vals=train_preds,
            actual_probas=train_probs,
            dataset_name=dataset,
            actual_class_vals=trainY,
            split="TRAIN",
        )


def _params_as_single_line(cls_name, classifier):
    """Return the classifier parameter description used as the second results line."""
    if "Composite" in cls_name:
        return "Para info too long!"
    # str.replace returns a new string, so the result has to be kept; line
    # breaks are replaced so the description stays on one line.
    return str(classifier.get_params()).replace("\n", " ").replace("\r", " ")
results_path=results_dir, cls_name=classifier, dataset=dataset, resampleID=resample, train_file=tf, ) else: # Local run # data_dir = "/scratch/univariate_datasets/" # results_dir = "/scratch/results" data_dir = "/bench/datasets/Univariate2018/" results_dir = "C:/Users/ajb/Dropbox/Turing Project/Results/" # data_dir = "Z:/ArchiveData/Univariate_ts/" # results_dir = "E:/Temp/" # results_dir = "Z:/Results/sktime Bakeoff/" dataset = "ItalyPowerDemand" trainX, trainY = load_ts(data_dir + dataset + "/" + dataset + "_TRAIN.ts") testX, testY = load_ts(data_dir + dataset + "/" + dataset + "_TEST.ts") classifier = "TSF" resample = 1 # for i in range(0, len(univariate_datasets)): # dataset = univariate_datasets[i] # # print(i) # # print(" problem = "+dataset) tf = False run_experiment( overwrite=True, problem_path=data_dir, results_path=results_dir, cls_name=classifier, dataset=dataset, resampleID=resample,
def load_and_run_classification_experiment(
    problem_path,
    results_path,
    cls_name,
    dataset,
    classifier=None,
    resample_id=0,
    overwrite=False,
    build_train=False,
    predefined_resample=False,
):
    """Load a dataset and run a classification experiment.

    Loads the train and test files of a problem, optionally resamples them,
    and passes them to run_classification_experiment, which is asked to
    produce the test results and, if requested, the train results.

    Parameters
    ----------
    problem_path : str
        Location of problem files, full path.
    results_path : str
        Location of where to write results.
    cls_name : str
        Name of the classifier; used in the results path and, when no
        classifier is given, passed to set_classifier to select one.
    dataset : str
        Name of problem. Files must be
        <problem_path>/<dataset>/<dataset>+"_TRAIN.ts", same for "_TEST".
    classifier : BaseClassifier, default=None
        Classifier to be used in the experiment. If None, one is created with
        set_classifier(cls_name, resample_id, build_train).
    resample_id : int, default=0
        Seed for resampling. If set to 0, the default train/test split from
        file is used. Also used in the result file names.
    overwrite : bool, default=False
        If False, results are only built when the matching result file
        (testResample<resample_id>.csv / trainResample<resample_id>.csv) is
        not already present. If True, existing files are not checked.
    build_train : bool, default=False
        Whether train results are requested from
        run_classification_experiment.
    predefined_resample : bool, default=False
        Read a predefined resample from file instead of performing a
        resample. If True the file names must include the resample_id after
        the dataset name, i.e.
        <problem_path>/<dataset>/<dataset>+<resample_id>+"_TRAIN.ts".
    """
    # Work out which result files still have to be produced.
    build_test = True
    if not overwrite:
        predictions_dir = (
            results_path + "/" + cls_name + "/Predictions/" + dataset + "/"
        )
        file_suffix = "Resample" + str(resample_id) + ".csv"
        if os.path.exists(predictions_dir + "test" + file_suffix):
            build_test = False
        if build_train and os.path.exists(predictions_dir + "train" + file_suffix):
            build_train = False
        nothing_to_build = build_test is False and build_train is False
        if nothing_to_build:
            return

    # Predefined resamples carry the resample id in the file name.
    file_stem = problem_path + dataset + "/" + dataset
    if predefined_resample:
        file_stem = file_stem + str(resample_id)
    X_train, y_train = load_ts(file_stem + "_TRAIN.ts")
    X_test, y_test = load_ts(file_stem + "_TEST.ts")

    # Only the default split is resampled here; a predefined resample is
    # used exactly as read from file.
    if not predefined_resample and resample_id != 0:
        X_train, y_train, X_test, y_test = stratified_resample(
            X_train, y_train, X_test, y_test, resample_id
        )

    if classifier is None:
        classifier = set_classifier(cls_name, resample_id, build_train)

    run_classification_experiment(
        X_train,
        y_train,
        X_test,
        y_test,
        classifier,
        results_path,
        dataset=dataset,
        cls_name=cls_name,
        resample_id=resample_id,
        test_file=build_test,
        train_file=build_train,
    )
def load_and_run_clustering_experiment(
    problem_path,
    results_path,
    cls_name,
    dataset,
    clusterer=None,
    resample_id=0,
    overwrite=False,
    format=".ts",
    train_file=False,
):
    """Run a clustering experiment.

    This version loads the data from file based on a path, optionally
    resamples it, encodes the class labels and hands everything to
    run_clustering_experiment. The train data is passed as the primary input;
    the test data and both label sets are passed as keyword arguments.

    Parameters
    ----------
    problem_path : str
        Location of problem files, full path.
    results_path : str
        Location of where to write results.
    cls_name : str
        Determines which clusterer to use if clusterer is None. In this case,
        set_clusterer is called with this cls_name. Also used in the results
        path.
    dataset : str
        Name of problem. Files must be <problem_path>/<dataset>/<dataset>+
        "_TRAIN"+format, same for "_TEST".
    clusterer : object, default = None
        Clusterer to use in the experiment. If None, one is created with
        set_clusterer(cls_name, resample_id).
    resample_id : int, default = 0
        Seed for resampling. If set to 0, the default train/test split from
        file is used. Also used in the result file names checked when
        overwrite is False.
    overwrite : boolean, default = False
        If False, this will only build results if the result files
        (testResample<resample_id>.csv and, when train_file is True,
        trainResample<resample_id>.csv) are not already present. If True,
        existing files are not checked.
    format : string, default = ".ts"
        File extension appended to the problem file names. Loading currently
        always goes through load_ts. For more info on format, see
        examples/loading_data.ipynb
    train_file : boolean, default = False
        Whether train results are wanted. Here it is only used to decide
        whether the experiment can be skipped because results already exist.
    """
    # Must be initialised before the existence checks: it is read below even
    # when no test result file was found (previously an UnboundLocalError).
    build_test = True
    if not overwrite:
        # Set up the file paths in standard format
        predictions_dir = (
            str(results_path) + "/" + str(cls_name) + "/Predictions/" + str(dataset)
        )
        full_path = predictions_dir + "/testResample" + str(resample_id) + ".csv"
        if os.path.exists(full_path):
            build_test = False
        if train_file:
            full_path = predictions_dir + "/trainResample" + str(resample_id) + ".csv"
            if os.path.exists(full_path):
                train_file = False
        if train_file is False and build_test is False:
            # Every requested result file is already present.
            return

    # currently only works with .ts
    trainX, trainY = load_ts(problem_path + dataset + "/" + dataset + "_TRAIN" + format)
    testX, testY = load_ts(problem_path + dataset + "/" + dataset + "_TEST" + format)
    if resample_id != 0:
        trainX, trainY, testX, testY = stratified_resample(
            trainX, trainY, testX, testY, resample_id
        )

    # The encoder is fitted on the train labels only and applied to both splits.
    le = preprocessing.LabelEncoder()
    le.fit(trainY)
    trainY = le.transform(trainY)
    testY = le.transform(testY)

    if clusterer is None:
        clusterer = set_clusterer(cls_name, resample_id)
    run_clustering_experiment(
        trainX,
        clusterer,
        trainY=trainY,
        testX=testX,
        testY=testY,
        cls_name=cls_name,
        dataset_name=dataset,
        results_path=results_path,
    )
# Demo run on the UnitTest problem with a two-cluster TimeSeriesKMeans:
# first through the loading wrapper, then directly on data loaded here.
data_dir = "../datasets/data/"
results_dir = "C:/Temp/Clusterers/"
dataset = "UnitTest"
clusterer = "kmeans"
resample = 0
tf = True
clst = TimeSeriesKMeans(n_clusters=2)

load_and_run_clustering_experiment(
    problem_path=data_dir,
    results_path=results_dir,
    cls_name=clusterer,
    dataset=dataset,
    clusterer=clst,
    resample_id=resample,
    overwrite=True,
    train_file=tf,
)

# Load the same problem by hand and run the experiment on the loaded data,
# writing under a "Temp/" sub-directory of the results location.
problem_stem = data_dir + dataset + "/" + dataset
train_X, train_Y = load_ts(problem_stem + "_TRAIN.ts")
test_X, test_Y = load_ts(problem_stem + "_TEST.ts")
run_clustering_experiment(
    train_X,
    clst,
    cls_name=clusterer,
    results_path=results_dir + "Temp/",
    trainY=train_Y,
    testX=test_X,
    testY=test_Y,
)