def conduct_auto_sklearn(experimentdatafile, resamplingstrategy, resamplingstrategyarguments, timefortask):
    '''
    Fit an auto-sklearn classifier on a pickled experiment and score it on the
    experiment's test split.

    Args:
        experimentdatafile: Path to a pickle file containing a mapping with the keys
            'train_data', 'train_target', 'test_data', 'test_target', 'directory',
            'features', 'algorithms' and 'parameters'
        resamplingstrategy: Passed to auto-sklearn as resampling_strategy
        resamplingstrategyarguments: Passed to auto-sklearn as resampling_strategy_arguments
        timefortask: Passed to auto-sklearn as time_left_for_this_task

    Returns:
        results (dict): the model description, the performance metrics and the
            experiment metadata, returned to Sacred for logging
    '''
    # Load data files. The file is opened in a context manager so the handle is
    # closed as soon as the pickle has been read.
    # NOTE(review): unpickling can execute arbitrary code; only load experiment
    # files that come from a trusted source.
    with open(experimentdatafile, "rb") as datafile:
        lunchbox = pickle.load(datafile)

    # Set up autosklearn and run against Pythia feature sets
    clf = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=timefortask,
        per_run_time_limit=360,
        initial_configurations_via_metalearning=25,
        ensemble_size=50,
        ensemble_nbest=50,
        seed=1,
        ml_memory_limit=3000,
        include_estimators=None,
        include_preprocessors=None,
        resampling_strategy=resamplingstrategy,
        resampling_strategy_arguments=resamplingstrategyarguments,
        tmp_folder=None,
        output_folder=None,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
        shared_mode=False)

    # TODO Find way to suppress voluminous INFO messages from autosklearn
    clf.fit(numpy.asarray(lunchbox['train_data']), numpy.asarray(lunchbox['train_target']))

    # Print autosklearn results; the description is built once and reused below
    models = clf.show_models()
    print("Models", models, file=sys.stderr)

    # Get performance metrics of autosklearn models against testing data
    predictions = clf.predict(numpy.asarray(lunchbox['test_data']))
    performresults = performance_metrics.get_perform_metrics(
        numpy.asarray(lunchbox['test_target']), predictions)

    # Fill results dictionary to return to Sacred for logging
    results = dict()
    results['autosklearn_models'] = models
    results['autosklearn_perform_results'] = performresults
    results['directory'] = lunchbox['directory']
    results['features'] = lunchbox['features']
    results['algorithms'] = lunchbox['algorithms']
    results['parameters'] = lunchbox['parameters']
    return results
def run_model(train_data, train_labels, test_data, test_labels):
    '''
    Fit a logistic regression classifier on the training set, predict labels for
    the test set and compute performance metrics for those predictions.

    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

    Args:
        train_data: Training samples handed to the classifier's fit method
        train_labels: Labels for the training samples
        test_data: Samples to predict labels for
        test_labels: True labels of the test samples, used only for scoring

    Returns:
        pred_labels: The predicted labels as determined by logistic regression
        perform_results: The performance metrics, tagged with the 'vector' and
            'alg' entries describing this run
    '''
    # Train the logistic regression model
    classifier = linear_model.LogisticRegression(C=1e5)
    classifier.fit(train_data, train_labels)

    # Score the trained model against the held-out test set
    pred_labels = classifier.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)

    # perform_results is a dictionary, so record which feature vector and
    # algorithm produced it alongside the metrics
    perform_results.update({'vector': 'Bag_of_Words', 'alg': 'Logistic_Regression'})
    return pred_labels, perform_results
def run_model(train_data, train_labels, test_data, test_labels):
    '''
    Fit a logistic regression classifier on the training set, predict labels for
    the test set and compute performance metrics for those predictions.

    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

    Args:
        train_data: Training samples handed to the classifier's fit method
        train_labels: Labels for the training samples
        test_data: Samples to predict labels for
        test_labels: True labels of the test samples, used only for scoring

    Returns:
        pred_labels: The predicted labels as determined by logistic regression
        perform_results: The performance metrics, tagged with the 'vector' and
            'alg' entries describing this run
    '''
    # Train the logistic regression model
    classifier = linear_model.LogisticRegression(C=1e5)
    classifier.fit(train_data, train_labels)

    # Score the trained model against the held-out test set
    pred_labels = classifier.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)

    # perform_results is a dictionary, so record which feature vector and
    # algorithm produced it alongside the metrics
    perform_results.update({'vector': 'Bag_of_Words', 'alg': 'Logistic_Regression'})
    return pred_labels, perform_results
def predicter(classifier, test_data, test_labels):
    '''
    Predict labels for the test set with a trained classifier and compute
    performance metrics for the predictions.

    Args:
        classifier: Trained model exposing a predict method
        test_data: Test samples, or the path (str) of an HDF5 file holding a
            'data' and a 'labels' dataset
        test_labels: True labels of the test samples; in the HDF5 case this must
            be the same path as test_data

    Returns:
        pred_labels: The labels predicted by the classifier
        perform_results: The performance metrics for the predictions
    '''
    # Handle HDF5 case: both arguments name the same file, which carries the
    # samples and the labels as separate datasets
    if isinstance(test_data, str):
        assert test_data == test_labels, \
            "HDF5 test data and test labels must refer to the same file"
        # Open read-only explicitly: nothing is written here, and older h5py
        # releases default to a writable mode when none is given
        with h5py.File(test_data, 'r') as f:
            test_data = f['data'][()]
            test_labels = f['labels'][()]

    pred_labels = classifier.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)
    return pred_labels, perform_results
def predicter(classifier, test_data, test_labels):
    '''
    Predict labels for the test set with a trained classifier and compute
    performance metrics for the predictions.

    Args:
        classifier: Trained model exposing a predict method
        test_data: Test samples, or the path (str) of an HDF5 file holding a
            'data' and a 'labels' dataset
        test_labels: True labels of the test samples; in the HDF5 case this must
            be the same path as test_data

    Returns:
        pred_labels: The labels predicted by the classifier
        perform_results: The performance metrics for the predictions
    '''
    # Handle HDF5 case: both arguments name the same file, which carries the
    # samples and the labels as separate datasets
    if isinstance(test_data, str):
        assert test_data == test_labels, \
            "HDF5 test data and test labels must refer to the same file"
        # Open read-only explicitly: nothing is written here, and older h5py
        # releases default to a writable mode when none is given
        with h5py.File(test_data, 'r') as f:
            test_data = f['data'][()]
            test_labels = f['labels'][()]

    pred_labels = classifier.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(
        test_labels, pred_labels)
    return pred_labels, perform_results
def conduct_auto_sklearn(experimentdatafile, resamplingstrategy, resamplingstrategyarguments, timefortask):
    '''
    Fit an auto-sklearn classifier on a pickled experiment and score it on the
    experiment's test split.

    Args:
        experimentdatafile: Path to a pickle file containing a mapping with the keys
            'train_data', 'train_target', 'test_data', 'test_target', 'directory',
            'features', 'algorithms' and 'parameters'
        resamplingstrategy: Passed to auto-sklearn as resampling_strategy
        resamplingstrategyarguments: Passed to auto-sklearn as resampling_strategy_arguments
        timefortask: Passed to auto-sklearn as time_left_for_this_task

    Returns:
        results (dict): the model description, the performance metrics and the
            experiment metadata, returned to Sacred for logging
    '''
    # Load data files. The file is opened in a context manager so the handle is
    # closed as soon as the pickle has been read.
    # NOTE(review): unpickling can execute arbitrary code; only load experiment
    # files that come from a trusted source.
    with open(experimentdatafile, "rb") as datafile:
        lunchbox = pickle.load(datafile)

    # Set up autosklearn and run against Pythia feature sets
    clf = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=timefortask,
        per_run_time_limit=360,
        initial_configurations_via_metalearning=25,
        ensemble_size=50,
        ensemble_nbest=50,
        seed=1,
        ml_memory_limit=3000,
        include_estimators=None,
        include_preprocessors=None,
        resampling_strategy=resamplingstrategy,
        resampling_strategy_arguments=resamplingstrategyarguments,
        tmp_folder=None,
        output_folder=None,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
        shared_mode=False)

    # TODO Find way to suppress voluminous INFO messages from autosklearn
    clf.fit(numpy.asarray(lunchbox['train_data']), numpy.asarray(lunchbox['train_target']))

    # Print autosklearn results; the description is built once and reused below
    models = clf.show_models()
    print("Models", models, file=sys.stderr)

    # Get performance metrics of autosklearn models against testing data
    predictions = clf.predict(numpy.asarray(lunchbox['test_data']))
    performresults = performance_metrics.get_perform_metrics(
        numpy.asarray(lunchbox['test_target']), predictions)

    # Fill results dictionary to return to Sacred for logging
    results = dict()
    results['autosklearn_models'] = models
    results['autosklearn_perform_results'] = performresults
    results['directory'] = lunchbox['directory']
    results['features'] = lunchbox['features']
    results['algorithms'] = lunchbox['algorithms']
    results['parameters'] = lunchbox['parameters']
    return results
def do_epoch(dmn, mode, epoch, batch_size, log_every, skipped=0):
    '''
    This function runs the epochs for the training of the neural network.
    It calls the steps in the epoch and allows for the tweaking of the parameter values.

    Args:
        dmn: the dynamic memory network object; it must provide
            get_batches_per_epoch(mode) and step(i, mode), where step returns a
            mapping with the keys 'prediction', 'answers', 'current_loss', 'log'
            and optionally 'skipped'
        mode (str): 'train' or 'test' for whether this is the training or testing step
        epoch (int): the epoch number currently on
        batch_size (int): number of examples per batch; used for progress logging
        log_every (int): a progress line is printed for every log_every-th batch
        skipped (int): how many steps have been skipped because of no change in gradient

    Returns:
        avg_loss (double): the summed loss of the non-skipped batches divided by
            the number of batches in the epoch
        skipped (int): how many steps skipped since the beginning of running the epoch
            added to the previous skip value
        perform_results: the performance metrics computed for the epoch's predictions
        y_pred (list): the predicted label of every example in the non-skipped batches
    '''
    # mode is 'train' or 'test'
    y_true = []
    y_pred = []
    avg_loss = 0.0
    prev_time = time.time()

    batches_per_epoch = dmn.get_batches_per_epoch(mode)
    print(batches_per_epoch, dmn)

    for i in range(batches_per_epoch):
        # Run step using the dynamic memory network object
        step_data = dmn.step(i, mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        current_skip = step_data.get("skipped", 0)
        log = step_data["log"]

        skipped += current_skip

        if current_skip == 0:
            avg_loss += current_loss
            y_true.extend(answers)

            for x in prediction.argmax(axis=1):
                # some predictions are not 0,1 for the first couple of guesses
                # TODO figure out why...but until then this catches the issue
                if x not in (0, 1):
                    x = np.random.randint(0, 2)
                y_pred.append(x)

            # TODO: save the state sometimes
            if i % log_every == 0:
                cur_time = time.time()
                print(" %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t skipped: %d \t %s \t time: %.2fs" %
                      (mode, epoch, i * batch_size, batches_per_epoch * batch_size,
                       current_loss, avg_loss / (i + 1), skipped, log, cur_time - prev_time))
                prev_time = cur_time

        if np.isnan(current_loss):
            print("==> current loss IS NaN. This should never happen :) ")
            # Raise SystemExit directly instead of calling exit(), which is a
            # helper installed by the site module and is not always available
            raise SystemExit

    avg_loss /= batches_per_epoch
    print("\n %s loss = %.5f" % (mode, avg_loss))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_true, y_pred))
    perform_results = performance_metrics.get_perform_metrics(y_true, y_pred)
    print(perform_results)

    # Accuracy is taken over the examples that were actually scored: skipped
    # batches add nothing to y_true, so dividing by
    # batches_per_epoch * batch_size would under-report it
    correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    evaluated = len(y_true)
    accuracy_percent = correct * 100.0 / evaluated if evaluated else 0.0
    print("accuracy: %.2f percent" % accuracy_percent)

    return avg_loss, skipped, perform_results, y_pred
def predicter(classifier, test_data, test_labels):
    '''
    Predict labels for the test set with a trained classifier and compute
    performance metrics for the predictions.

    Args:
        classifier: Trained model exposing a predict method
        test_data: Test samples to predict labels for
        test_labels: True labels of the test samples, used only for scoring

    Returns:
        The predicted labels and the performance metrics computed for them
    '''
    predicted = classifier.predict(test_data)
    # Metrics compare the true labels (first argument) against the predictions
    scores = performance_metrics.get_perform_metrics(test_labels, predicted)
    return predicted, scores
def predicter(classifier, test_data, test_labels):
    '''
    Predict labels for the test set with a trained classifier and compute
    performance metrics for the predictions.

    Args:
        classifier: Trained model exposing a predict method
        test_data: Test samples to predict labels for
        test_labels: True labels of the test samples, used only for scoring

    Returns:
        The predicted labels and the performance metrics computed for them
    '''
    predicted = classifier.predict(test_data)
    # Metrics compare the true labels (first argument) against the predictions
    scores = performance_metrics.get_perform_metrics(test_labels, predicted)
    return predicted, scores