def single_word_file_predict(data_filename, predict_filename):
    print('Predict file ' + data_filename)
    sentence_list = []
    words_list = []
    labels_list = []
    with open(data_filename, mode='r') as data_file:
        for line in data_file:
            word_list, label_list = data_utils.split(line)
            if word_list and label_list:
                sentence_list.append(''.join(word_list))
                words_list.append(' '.join(word_list))
                labels_list.append(' '.join(label_list))
    predict_labels_list = words_predict(words_list)

    word_predict_label_list = []
    word_category_list = []
    word_predict_category_list = []
    for (words, labels, predict_labels) in zip(words_list, labels_list,
                                               predict_labels_list):
        word_list = words.split()
        label_list = labels.split()
        predict_label_list = predict_labels.split()
        word_predict_label = ' '.join([
            word + '/' + predict_label
            for (word, predict_label) in zip(word_list, predict_label_list)
        ])
        word_predict_label_list.append(word_predict_label)
        # merge gold labels
        merge_word_list, merge_label_list = data_utils.merge_label(
            word_list, label_list)
        word_category = ' '.join([
            word + '/' + label
            for (word, label) in zip(merge_word_list, merge_label_list)
            if label != 'O'
        ])
        word_category_list.append(word_category)
        # merge predicted labels
        merge_predict_word_list, merge_predict_label_list = data_utils.merge_label(
            word_list, predict_label_list)
        word_predict_category = ' '.join([
            predict_word + '/' + predict_label
            for (predict_word, predict_label) in zip(merge_predict_word_list,
                                                     merge_predict_label_list)
            if predict_label != 'O'
        ])
        word_predict_category_list.append(word_predict_category)

    with open(predict_filename, mode='w') as predict_file:
        for (sentence, word_predict_label, word_category,
             word_predict_category) in zip(sentence_list,
                                           word_predict_label_list,
                                           word_category_list,
                                           word_predict_category_list):
            predict_file.write('Passage: ' + sentence + '\n')
            predict_file.write('SinglePredict: ' + word_predict_label + '\n')
            predict_file.write('Merge: ' + word_category + '\n')
            predict_file.write('MergePredict: ' + word_predict_category + '\n')
            predict_file.write('\n')
def train_svm(dataset_loader, test_points, data_limit=0):
    input_, output = get_data_up_to_limit(dataset_loader, data_limit)
    input_, output = data_utils.construct_one_vs_all(input_, output, 0)
    (input_train, input_test, output_train,
     output_test) = data_utils.split(input_, output, test_points)
    # run the SVM: train on the training split, then evaluate on the test split
    svm = SVM()
    svm.give_training_data(input_train, output_train)
    svm.train()
    svm.give_test_data(input_test, output_test)
    svm.analyze()
def file_predict(data_filename, predict_filename):
    print('Predict file ' + data_filename)
    words_list = []
    labels_list = []
    with open(data_filename, mode='r') as data_file:
        for line in data_file:
            word_list, label_list = data_utils.split(line)
            if word_list and label_list:
                words_list.append(' '.join(word_list))
                labels_list.append(' '.join(label_list))
    predict_labels_list = words_predict(words_list)
    with open(predict_filename, mode='w') as predict_file:
        for (words, labels, predict_labels) in zip(words_list, labels_list,
                                                   predict_labels_list):
            predict_file.write('Passage: ' + words + '\n')
            predict_file.write('Label: ' + labels + '\n')
            predict_file.write('PredictLabel: ' + predict_labels + '\n' + '\n')
def main():
    parser = argparse.ArgumentParser(
        description='Split train.csv into train, dev, and test splits. '
        'Specify the dev and validation set sizes with arguments; the '
        'remainder is used for training.')
    parser.add_argument(
        '--dataset-file',
        required=True,
        help='path to the train.csv file containing the Quora training data')
    parser.add_argument('--ndev',
                        type=int,
                        default=10000,
                        help='size of dev set to create')
    parser.add_argument('--nvalid',
                        type=int,
                        default=50000,
                        help='size of validation set to create')
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory to which to write train.csv, dev.csv, and valid.csv')
    parser.add_argument(
        '--seed',
        type=int,
        help='optional random seed for reproducibility across multiple uses '
        'of this tool')
    args = parser.parse_args()

    data = du.load_csv(args.dataset_file)
    shuffled = du.shuffle(data, args.seed)
    ntrain = len(data) - args.ndev - args.nvalid
    train, dev, valid = du.split(shuffled, ntrain, args.ndev, args.nvalid)
    du.write_csv(train, os.path.join(args.output_dir, 'train.csv'))
    du.write_csv(dev, os.path.join(args.output_dir, 'dev.csv'))
    du.write_csv(valid, os.path.join(args.output_dir, 'valid.csv'))
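# Entry-point guard plus an illustrative invocation; the script name and paths
# below are placeholders, not taken from the original project.
#
#   python split_quora.py --dataset-file train.csv --output-dir splits \
#       --ndev 10000 --nvalid 50000 --seed 42
if __name__ == '__main__':
    main()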
def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Compare samples from a model against the training set and the test set
    using MMD.
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples
        synth_data = np.load('./experiments/tstr/' + identifier + '_' +
                             str(epoch) + '.data.npy').item()
        model_samples = synth_data['samples']
        synth_labels = synth_data['labels']
        # load real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier +
                            '.data.npy').item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            model_samples = pickle.load(open('REDACTED', 'rb'))
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1, 16, 4)
            vali = samples['vali'].reshape(-1, 16, 4)
            test = samples['test'].reshape(-1, 16, 4)
            #train_targets = labels['train']
            #vali_targets = labels['vali']
            #test_targets = labels['test']
            train, vali, test = data_utils.scale_data(train, vali, test)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            model_samples, model_labels = pickle.load(
                open(
                    the_dir +
                    'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
                    + which + '_190.pk', 'rb'))
            # get test and train...
            # (generated with fixed seed...)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            train, vali, test, labels_split = data_utils.split(
                samples, labels=labels, random_seed=1,
                proportions=proportions)
            np.random.seed()
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        else:
            settings = json.load(
                open('./experiments/settings/' + identifier + '.txt', 'r'))
            # get the test, train sets
            data = np.load('./experiments/data/' + identifier +
                           '.data.npy').item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(settings, epoch,
                                                       n_samples)

    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
        model_samples,
        test,
        np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma,
        computeMMDs=False)
    #pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, np.random.permutation(train)[:n_samples], test, sigma=heuristic_sigma, computeMMDs=False)
    # MMD_3_Sample_Test takes (X, Y, Z) as its first arguments and tests whether
    # MMDXY (here, MMD between model samples and test) is less than MMDXZ
    # (here, MMD between model samples and train).
    # if pvalue < 0.05:
    #     print('At confidence level 0.05, we reject the null hypothesis that '
    #           'MMDXY <= MMDXZ, and conclude that the test data has a smaller '
    #           'MMD with the true data than the generated data')
    # else:
    #     print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, '
    #           'and cannot conclude that the test data has a smaller MMD with '
    #           'the true data than the generated data')
    return pvalue, tstat, sigma
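# mmd.median_pairwise_distance comes from the surrounding project. The sketch
# below is an assumption about the median heuristic it presumably implements
# (flatten each sample, compute pairwise Euclidean distances, take the median);
# the subsampling cap is added here only to keep the sketch cheap.
import numpy as np


def median_pairwise_distance_sketch(X, max_points=500):
    X = X.reshape(X.shape[0], -1)
    if X.shape[0] > max_points:
        X = X[np.random.choice(X.shape[0], max_points, replace=False)]
    sq_norms = np.sum(X ** 2, axis=1)
    sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2 * X @ X.T
    sq_dists = np.maximum(sq_dists, 0)
    iu = np.triu_indices_from(sq_dists, k=1)  # exclude the zero diagonal
    return np.sqrt(np.median(sq_dists[iu]))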
]
data_settings = dict(
    (k, settings[k]) for k in data_vars if k in settings.keys())
samples, pdf, labels = data_utils.get_data(settings['data'], data_settings)
if 'multivariate_mnist' in settings and settings['multivariate_mnist']:
    seq_length = samples.shape[1]
    samples = samples.reshape(-1, int(np.sqrt(seq_length)),
                              int(np.sqrt(seq_length)))
if 'normalise' in settings and settings['normalise']:  # TODO this is a mess, fix
    print(settings['normalise'])
    norm = True
else:
    norm = False
if labels is None:
    train, vali, test = data_utils.split(samples, [0.6, 0.2, 0.2],
                                         normalise=norm)
    train_labels, vali_labels, test_labels = None, None, None
else:
    train, vali, test, labels_list = data_utils.split(samples,
                                                      [0.6, 0.2, 0.2],
                                                      normalise=norm,
                                                      labels=labels)
    train_labels, vali_labels, test_labels = labels_list

labels = dict()
labels['train'], labels['vali'], labels['test'] = (train_labels, vali_labels,
                                                   test_labels)
del train_labels
del vali_labels
del test_labels
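# data_utils.split above is project code that is not shown here. As an
# illustration only, a stand-in with a similar proportion-based interface might
# look like the sketch below (split_by_proportions is a hypothetical name; the
# real function also takes a normalise flag, which is omitted in this sketch).
import numpy as np


def split_by_proportions(samples, proportions, labels=None):
    n = samples.shape[0]
    n_train = int(proportions[0] * n)
    n_vali = int(proportions[1] * n)
    idx = np.random.permutation(n)
    train_idx = idx[:n_train]
    vali_idx = idx[n_train:n_train + n_vali]
    test_idx = idx[n_train + n_vali:]
    parts = (samples[train_idx], samples[vali_idx], samples[test_idx])
    if labels is None:
        return parts
    label_parts = [labels[train_idx], labels[vali_idx], labels[test_idx]]
    return parts + (label_parts,)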
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize
])
transforms_test = transforms.Compose([transforms.ToTensor(), normalize])

best_acc = 0
start_epoch = 0

trainset = torchvision.datasets.CIFAR10(root='./data',
                                        train=True,
                                        download=True,
                                        transform=None)
train_data, train_labels = trainset.data, np.squeeze(trainset.targets)
unlabeled_idxs, labeled_idxs, _ = du.split(trainset, sn=5000, v_sn=0)
train_datasets = DT(trainData=train_data[labeled_idxs, :, :, :],
                    trainLabel=train_labels[labeled_idxs],
                    transform=transforms_train)
trainloader = torch.utils.data.DataLoader(train_datasets,
                                          batch_size=100,
                                          shuffle=True,
                                          num_workers=4)
testset = torchvision.datasets.CIFAR10(root='./data',
                                       train=False,
                                       download=True,
                                       transform=transforms_test)
testloader = torch.utils.data.DataLoader(testset,
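# DT is not defined in the excerpt above; the sketch below is an assumption
# about a minimal Dataset wrapper matching the constructor call used there
# (trainData, trainLabel, transform). The real class in the project may differ.
from PIL import Image
from torch.utils.data import Dataset


class DT(Dataset):
    def __init__(self, trainData, trainLabel, transform=None):
        self.data = trainData        # e.g. uint8 array of shape (N, 32, 32, 3)
        self.labels = trainLabel     # integer class labels of shape (N,)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img = Image.fromarray(self.data[idx])  # HWC uint8 -> PIL image
        if self.transform is not None:
            img = self.transform(img)
        return img, int(self.labels[idx])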
samples = []
# Load the stock data row by row.
# Shape is num_samples * sequence_length (each stock's time series) *
# number of signal channels (high, open, close, low, volume at each time step).
# Everything needs to be an np.ndarray.
for stock, df_stock in df.groupby('Name'):
    sequence = []
    for index, row in df_stock.iterrows():
        #print(list(row[1:6]))
        sequence.append(np.array(list(row[1:6])))
    sequence = np.array(sequence)
    if len(sequence) == 1259:
        samples.append(sequence)
samples = np.array(samples)

# split into train/vali/test; the normalise/scale option in data_utils.split
# doesn't work correctly here, so scaling is done manually below
train, vali, test = data_utils.split(samples, [0.6, 0.2, 0.2], normalise=False)
train_labels, vali_labels, test_labels = None, None, None

# check that values scale correctly
print("Before scaling")
print(train)
for i in range(len(train)):
    train[i] = scale_linear_bycolumn(train[i])
for i in range(len(vali)):
    vali[i] = scale_linear_bycolumn(vali[i])
for i in range(len(test)):
    test[i] = scale_linear_bycolumn(test[i])
print("After scaling")
print(train)

labels = dict()
labels['train'], labels['vali'], labels['test'] = (train_labels, vali_labels,
                                                   test_labels)
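# scale_linear_bycolumn is not defined in the snippet above. The following is a
# minimal sketch, assuming it linearly rescales each column (signal channel) of
# a 2-D array into a fixed range; the range and exact behaviour of the original
# helper may differ, and in a real script it would need to be defined before
# the calls above.
import numpy as np


def scale_linear_bycolumn(rawpoints, high=1.0, low=0.0):
    mins = np.min(rawpoints, axis=0)
    maxs = np.max(rawpoints, axis=0)
    rng = maxs - mins
    rng[rng == 0] = 1.0  # avoid division by zero for constant columns
    return high - (high - low) * (maxs - rawpoints) / rng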