def main():
    """Produce the exploratory plots and correlation tables for the red-wine data.

    Loads ``data/winequality-red.csv`` via ``utils.read_data_from_csv``, calls
    ``plot_histogram`` once for every (attribute, binning function) pair, then
    builds a ``DataFrame`` from the records for the scatter-matrix and
    parallel-coordinates plots, draws the PCA projection twice (default and
    ``normalized=True``) plus the MDS plot, and finally writes the Pearson and
    Kendall correlation matrices to ``build/pearson.csv`` and
    ``build/kendall.csv``.

    NOTE(review): assumes the loader returns a non-empty sequence of dict-like
    records (``data[0].keys()`` is used for the attribute names) -- confirm
    against ``utils.read_data_from_csv``.
    """
    data = utils.read_data_from_csv('data/winequality-red.csv')

    # Attribute names are taken from the first record.
    for attribute in data[0].keys():
        # items() is available on both Python 2 and Python 3 mappings;
        # iteritems() only exists on Python 2 and fails on Python 3.
        for name, func in BIN_FUNCTIONS.items():
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)
    plot_mds(data)

    data_frame.corr(method='pearson').to_csv('build/pearson.csv')
    data_frame.corr(method='kendall').to_csv('build/kendall.csv')
def main():
    """Run the full red-wine visualisation pipeline.

    Steps, in order:

    1. Read ``data/winequality-red.csv`` with ``utils.read_data_from_csv``.
    2. Call ``plot_histogram`` for each attribute of the first record combined
       with each entry of ``BIN_FUNCTIONS``.
    3. Wrap the records in a ``DataFrame`` and pass it to
       ``plot_scatter_matrix`` and ``plot_parallel_coordinates``.
    4. Call ``plot_pca_projection`` (default and ``normalized=True``) and
       ``plot_mds`` on the raw records.
    5. Write the Pearson and Kendall correlation matrices to
       ``build/pearson.csv`` and ``build/kendall.csv``.

    NOTE(review): ``data[0]`` is indexed unconditionally, so the loader is
    presumably expected to return at least one dict-like record -- verify.
    """
    data = utils.read_data_from_csv('data/winequality-red.csv')

    for attribute in data[0].keys():
        # Use items() rather than the Python-2-only iteritems() so the loop
        # runs unchanged on Python 2 and Python 3.
        for name, func in BIN_FUNCTIONS.items():
            plot_histogram(data, attribute, func, name)

    data_frame = DataFrame(data)
    plot_scatter_matrix(data_frame)
    plot_parallel_coordinates(data_frame)

    plot_pca_projection(data)
    plot_pca_projection(data, normalized=True)
    plot_mds(data)

    data_frame.corr(method='pearson').to_csv('build/pearson.csv')
    data_frame.corr(method='kendall').to_csv('build/kendall.csv')
import torch from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification, BertTokenizer from tqdm import trange import tarfile import config from utils import read_data_from_csv, prepare_data_bert if __name__ == '__main__': # train_data is the same thing as the train_data and test_data outputs from preprocess_data, just pickled # This helps avoid having to run the preprocess_data script every time print("Loading data...") if config.EQUALIZE_CLASS_COUNTS is True: print("Equalizing class counts!") train_data = read_data_from_csv(filename=config.CSV_FILENAME_TRAIN, train=True, num_records=config.BERT_NUM_RECORDS, equalize=config.EQUALIZE_CLASS_COUNTS) print("Loading models...") tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # initialize the model with 2 output classes model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2) device = torch.device(config.DEVICE) model = model.to(device) # initialize the optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_params = [{ 'params':
print "In-sample variance: %f" % numpy.var(in_sample_errors) print "In-sample mean: %f" % numpy.mean(in_sample_errors) out_sample_errors = calculate_linear_errors(model, model.predict(X_test), Y_test) plot_errors(out_sample_errors, 'Absolute error (out-sample)') utils.save_plot(pyplot, name = "build/%s_out_sample.png" % type) print "Out-of-sample variance: %0.3f" % numpy.var(out_sample_errors) print "Out-of-sample mean: %0.3f" % numpy.mean(out_sample_errors) return (numpy.mean(out_sample_errors) + numpy.mean(in_sample_errors)) / 2 if __name__ == '__main__': dataset = utils.dict_to_numpy( utils.read_data_from_csv('data/winequality-red.csv'), columns_to_exclude = ['fixed acidity', 'chlorides', 'free sulfur dioxide']) data = dataset['data'] target = dataset['target'] attributes = dataset['attributes'] X_train = data[:-100] X_test = data[-100:] Y_train = target[:-100] Y_test = target[-100:] print 'Linear regression' regression_model = linear_model.LinearRegression() regression(regression_model, X_train, X_test, Y_train, Y_test, 'linear') print
import config
from utils import read_data_from_csv, prepare_data_bert, print_evaluation_score


def flat_accuracy(preds, labels):
    """Return the share of predictions that match their labels.

    Computes ``np.sum(preds == labels)`` and divides it by ``len(labels)``.

    NOTE(review): presumably ``preds`` and ``labels`` are equally sized NumPy
    arrays of class ids, so that ``==`` compares element by element -- confirm
    against the caller.
    """
    accuracy = np.sum(preds == labels) / len(labels)
    return accuracy


if __name__ == '__main__':
    print("Loading data...")
    # In this script the flag only triggers a log message; it is not passed
    # on to read_data_from_csv below.
    if config.EQUALIZE_CLASS_COUNTS is True:
        print("Equalized class counts!")
    test_data = read_data_from_csv(filename=config.CSV_FILENAME_TEST, train=False)

    print("Loading models...")
    device = torch.device(config.DEVICE)
    # Model weights and vocabulary are loaded from the paths named in config,
    # with a two-label classification head.
    model = BertForSequenceClassification.from_pretrained(config.BERT_TAR_FILE, num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(config.BERT_VOCAB_FILE, do_lower_case=True)

    print("Preparing testing data...")
    max_sent_len = config.BERT_MAX_SENT_LEN
    test_dataloader = prepare_data_bert(test_data, tokenizer, max_sent_len)

    print("Evaluating the model...")
    # Put the model in evaluation mode, then move it to the configured device.
    model.eval()
    model.to(device)