def format_dataset(formatted_dataset_dir=DEFAULT_FORMATTED_DATATSET_DIR, log_file=io.StringIO()):
    dataset, labels, label_map = load_dataset()
    print("randomizing the dataset...", file=log_file)
    print("train_test_split the dataset...", file=log_file)
    train_data, test_data, train_labels, test_labels = train_test_split(dataset, labels)
    print("reformatting the dataset...", file=log_file)
    train_data, train_labels = _format_dataset(train_data, train_labels, IMAGE_SIZE, len(label_map))
    test_data, test_labels = _format_dataset(test_data, test_labels, IMAGE_SIZE, len(label_map))
    print("train_data:", train_data.shape, file=log_file)
    print("train_labels:", train_labels.shape, file=log_file)
    print("test_data:", test_data.shape, file=log_file)
    print("test_labels:", test_labels.shape, file=log_file)
    print("pickling the dataset...", file=log_file)
    formatted_train_dataset_path = os.path.join(formatted_dataset_dir, 'train_dataset.pickle')
    train_dataset = DataSet(train_data, train_labels, label_map)
    with open(formatted_train_dataset_path, 'wb') as f:
        pickle.dump(train_dataset, f, protocol=2)  # protocol=2 for compatibility with Python 2.7
    formatted_test_dataset_path = os.path.join(formatted_dataset_dir, 'test_dataset.pickle')
    test_dataset = DataSet(test_data, test_labels, label_map)
    with open(formatted_test_dataset_path, 'wb') as f:
        pickle.dump(test_dataset, f, protocol=2)
    label_map_path = os.path.join(formatted_dataset_dir, 'label_map.pickle')
    with open(label_map_path, 'wb') as f2:
        pickle.dump(label_map, f2, protocol=2)
    print("dataset has been saved at %s" % formatted_dataset_dir, file=log_file)
    print("format_dataset has finished", file=log_file)
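# A minimal usage sketch for format_dataset above. It assumes the surrounding module
# already defines DEFAULT_FORMATTED_DATATSET_DIR, load_dataset(), _format_dataset(),
# IMAGE_SIZE and DataSet; here the progress log is streamed to stdout instead of the
# default in-memory StringIO buffer.
import sys

if __name__ == "__main__":
    format_dataset(log_file=sys.stdout)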
def Train_test(all_data, test_percentage):
    # Split the data set into train and test portions.
    # Note that the test set is used for validation; the predict set is used as the pure test set.
    N_rows = all_data.labels.shape[0]
    N_test = int(N_rows * test_percentage)
    N_months = ceil(N_rows / 30)   # number of (roughly monthly) periods
    step = int(N_test / N_months)  # number of test samples taken from each period
    Train_inx = []
    Test_inx = []
    for i in range(0, N_rows - 30, 30):
        Train_inx.extend(range(i, i + 30 - step))
        Test_inx.extend(range(i + 30 - step, i + 30))
    # The last period is shorter than a full month, so take `step` test samples
    # from its tail and leave the rest for training.
    if N_rows - (i + 30) > step:
        Train_inx.extend(range(i + 30, N_rows - step))
        Test_inx.extend(range(N_rows - step, N_rows))
    else:
        Test_inx.extend(range(i + 30, N_rows))
    Train = DataSet(all_data.features[Train_inx], all_data.labels[Train_inx], all_data.date[Train_inx])
    Train.label_max = all_data.label_max
    Train.label_min = all_data.label_min
    Test = DataSet(all_data.features[Test_inx], all_data.labels[Test_inx], all_data.date[Test_inx])
    return Train, Test
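# A small, self-contained check of the monthly index split used by Train_test above
# (an illustration only: it repeats the index arithmetic without the DataSet class).
# With N_rows = 90 and test_percentage = 0.2 we get N_test = 18, N_months = 3 and
# step = 6, so the last 6 indices of each 30-row block land in the test set.
from math import ceil

def _split_indices(N_rows, test_percentage):
    N_test = int(N_rows * test_percentage)
    step = int(N_test / ceil(N_rows / 30))
    train_inx, test_inx = [], []
    for i in range(0, N_rows - 30, 30):
        train_inx.extend(range(i, i + 30 - step))
        test_inx.extend(range(i + 30 - step, i + 30))
    if N_rows - (i + 30) > step:
        train_inx.extend(range(i + 30, N_rows - step))
        test_inx.extend(range(N_rows - step, N_rows))
    else:
        test_inx.extend(range(i + 30, N_rows))
    return train_inx, test_inx

train_inx, test_inx = _split_indices(90, 0.2)
assert len(test_inx) == 18 and not set(train_inx) & set(test_inx)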
def do_all_tests(theIndexes, searchRatio):
    dataSets = [
        DataSet('DATASETS/DATASET1.TXT'),
        DataSet('DATASETS/DATASET2.TXT'),
        DataSet('DATASETS/DATASET3.TXT')
    ]
    allStats = []
    theTester = Tester()
    theModel = ModelWrapper()
    print('[[[[ STARTING THE MOTHER OF ALL TESTS ]]]]')
    for useCNN in [False, True]:
        print('[[[ ONLY CNN LAYERS ' + str(useCNN).upper() + ' ]]]')
        for curIndex in theIndexes:
            print('[[ TESTING MODEL ' + curIndex[0] + ' WITH TEST SET ' + str(curIndex[1] + 1) + ' ]]')
            theModel.load(curIndex[0])
            theTester.set_params(theModel, dataSets[curIndex[1]])
            curStats = theTester.compute_fullstats(useCNN=useCNN, searchRatio=searchRatio)
            allStats.append(curStats)
            print('[[ MODEL TESTED ]]')
            with open('ALLSTATS_PCT' + str(int(searchRatio * 100)) + '.pkl', 'wb') as outFile:
                dump(allStats, outFile)
        print('[[[ FINISHED ONLY CNN LAYERS ' + str(useCNN).upper() + ' ]]]')
    print('[[[[ FINISHED THE MOTHER OF ALL TESTS ]]]]')
def init_model(fold, train_data, train_label, val_data, val_label, test_data, test_label):
    train_source = DataSet(train_data, train_label)
    val_source = DataSet(val_data, val_label)
    test_source = DataSet(test_data, test_label)
    print('train_len:', len(train_source))
    print('test_len:', len(test_source))
    _lr = 1e-4
    print('Initialize lr as %f' % _lr)
    model_config = {
        'dout': True,
        'lr': _lr,
        'num_classes': 2,
        'num_workers': 8,
        'batch_size': 64,
        'restore_iter': 0,
        'total_iter': 5000,
        'model_name': 'MGH-dw-all-' + fold,
        'pretrain_point': None,
        'train_source': train_source,
        'val_source': val_source,
        'test_source': test_source
    }
    model_config['save_name'] = '_'.join([
        '{}'.format(model_config['model_name']),
        '{}'.format(model_config['dout']),
        '{}'.format(0.0001),
        '{}'.format(model_config['batch_size']),
    ])
    os.makedirs(osp.join('model', model_config['model_name']), exist_ok=True)
    return Model(**model_config)
def dwi_philips(dataset):
    tag_bval = Tag(0x2001, 0x1003)
    tag_bvec = Tag(0x2001, 0x1004)
    tag_bvec_rl = Tag(0x2005, 0x10b0)
    tag_bvec_ap = Tag(0x2005, 0x10b1)
    tag_bvec_fh = Tag(0x2005, 0x10b2)
    if not all(x in dataset for x in (tag_bval, tag_bvec)):
        return None
    dwi_dataset = DataSet()
    if isinstance(dataset[tag_bval].value, (list, tuple)) and dataset[tag_bval].value:
        dwi_dataset.diffusion_bvalue = FD(dataset[tag_bval].value[0])
    else:
        dwi_dataset.diffusion_bvalue = FD(dataset[tag_bval].value)
    gradient_dataset = DataSet()
    if not isinstance(dataset[tag_bvec], CS):
        gradient_dataset.diffusion_gradient_orientation = FD(
            [float(x) for x in dataset[tag_bvec].value])
    else:
        gradient_dataset.diffusion_gradient_orientation = FD([
            float(dataset[x].value)
            for x in (tag_bvec_rl, tag_bvec_ap, tag_bvec_fh)
        ])
    dwi_dataset.diffusion_gradient_direction_sequence = SQ([gradient_dataset])
    dwi_dataset.diffusion_directionality = CS("DIRECTIONAL")
    return dwi_dataset
def read_train_sets(train_path, image_size, classes, validation_size):
    data_set = DataSet()
    images, labels, img_names, class_array = load_train_data(train_path, image_size, classes)
    images, labels, img_names, class_array = shuffle(images, labels, img_names, class_array)
    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])
    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    validation_img_names = img_names[:validation_size]
    validation_cls = class_array[:validation_size]
    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_img_names = img_names[validation_size:]
    train_cls = class_array[validation_size:]
    data_set.train = DataSet(train_images, train_labels, train_img_names, train_cls)
    data_set.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)
    return data_set
def __init__(self, sess, epoch, batch_size, checkpoint_dir, log_dir, learning_rate=0.00001, beta1=0.5):
    self.sess = sess
    self.keep_prob = 1.0
    # self.dataset_name = dataset_name
    # self.result_dir = result_dir
    self.log_dir = log_dir
    self.checkpoint_dir = checkpoint_dir
    self.epoch = epoch
    self.batch_size = batch_size
    self.beta1 = beta1
    self.label_dim = 50
    self.train_set = DataSet("../data/train_augment", self.batch_size, self.label_dim)
    self.test_set = DataSet("../data/test_augment", self.batch_size, self.label_dim)

    # parameters
    self.input_height = 227
    self.input_width = 227
    # self.output_height = 224
    # self.output_width = 224
    self.c_dim = 3

    # train
    self.init_learning_rate = learning_rate

    # get number of batches for a single epoch
    self.num_batches = self.train_set.total_batches
    self.test_num_batches = self.test_set.total_batches
def train(): #训练数据 data_train, label_train = DataSet.data_from_text("./Hnd/trainyny.txt",1450) train = DataSet(data_train, label_train, dtype=dtypes.float32) data_test, label_test = DataSet.data_from_text("./Hnd/testyny.txt",145) test = DataSet(data_test, label_test, dtype=dtypes.float32) Datasetsx = collections.namedtuple('Datasetsx', ['train', 'test']) Data = Datasetsx(train=train, test=test) #训练过程 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver({'s_w': W, 's_b': b}) for i in range(50000): #训练阶段,迭代50000次 batch_xs, batch_ys = Data.train.next_batch(50) #按批次训练,每批50行数据 sess.run(train_step, feed_dict={x: batch_xs, y_actual: batch_ys}) #执行训练 accu = 0 if(i%50==0): #每训练100次,测试一次 accu = sess.run(accuracy, feed_dict={ x: Data.test.images, y_actual: Data.test.labels}) print ("accuracy:", accu) if(accu>Target_Accuracy): break saver.save(sess, "./model/softmax.ckpt")
def choose_dataset():
    """Lets the user select a data set by keyboard input."""
    val = input('Type "r" for restaurant data set, "p" for plants data set, '
                '"b" for books data set, or anything else for business data set: ')
    if val == 'r':
        size = int(input('How many examples do you want to use? '))
        return SyntheticRestaurant(size)
    elif val == 'p':
        dataset = DataSet(
            attr_names='Habitat Colour TypeOfLeaf LeafWidth LeafLength Height EdibleOrPoisonous',
            name='plants',
            source='http://mldata.org/repository/data/viewslug/plant-classification')
    elif val == 'b':
        dataset = DataSet(
            attr_names='Genre MenBuyers WomenBuyers Price CriticismRate ? LikedByAudience',
            name='books',
            source='http://mldata.org/repository/data/viewslug/book-evaluation-complete')
    else:
        dataset = DataSet(
            attr_names='X1 X2 X3 X4 X5 Successful',
            name='business',
            source='http://mldata.org/repository/data/viewslug/successful-business')
    return choose_size(dataset)
def dataset_reshaped(data_sets):
    train_images = data_sets.train.x
    train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
    train_labels = data_sets.train.labels
    n_values = np.max(train_labels) + 1
    train_labels = np.eye(n_values)[train_labels]

    validation_images = data_sets.validation.x
    validation_images = validation_images.reshape(validation_images.shape[0], 28, 28, 1)
    validation_labels = data_sets.validation.labels
    n_values = np.max(validation_labels) + 1
    validation_labels = np.eye(n_values)[validation_labels]

    test_images = data_sets.test.x
    test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)
    test_labels = data_sets.test.labels
    n_values = np.max(test_labels) + 1
    test_labels = np.eye(n_values)[test_labels]

    train = DataSet(train_images, train_labels, size_change=True)
    validation = DataSet(validation_images, validation_labels, size_change=True)
    test = DataSet(test_images, test_labels, size_change=True)
    return base.Datasets(train=train, validation=validation, test=test)
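# Quick illustration of the np.eye one-hot trick used in dataset_reshaped above:
# indexing an identity matrix with integer labels yields one-hot rows.
import numpy as np

labels = np.array([0, 2, 1])
one_hot = np.eye(labels.max() + 1)[labels]
# one_hot == [[1, 0, 0], [0, 0, 1], [0, 1, 0]]
assert one_hot.shape == (3, 3) and one_hot[1, 2] == 1.0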
def read_data_sets(train_dir, seed=0):
    one_hot = False

    class DataSets(object):
        pass

    data_sets = DataSets()
    TRAIN_IMAGES = "train-images-idx3-ubyte.gz"
    TRAIN_LABELS = "train-labels-idx1-ubyte.gz"
    TEST_IMAGES = "t10k-images-idx3-ubyte.gz"
    TEST_LABELS = "t10k-labels-idx1-ubyte.gz"
    local_file = maybe_download(TRAIN_IMAGES, train_dir)
    train_images = extract_images(local_file)
    local_file = maybe_download(TRAIN_LABELS, train_dir)
    train_labels = extract_labels(local_file, one_hot=one_hot)
    local_file = maybe_download(TEST_IMAGES, train_dir)
    test_images = extract_images(local_file)
    local_file = maybe_download(TEST_LABELS, train_dir)
    test_labels = extract_labels(local_file, one_hot=one_hot)
    print('Train', train_images.shape)
    print('Test', test_images.shape)
    data_sets.train = DataSet(train_images, train_labels, seed=seed)
    data_sets.test = DataSet(test_images, test_labels, seed=seed)
    return data_sets
def generate_folds(self, data_set, num_fold):
    """
    Generate smaller, non-overlapping data sets from the master data set.

    The folds are returned as a list of tuples, one tuple per fold:
    t[0] is the testing fold of the data, t[1] is the inverse of the fold
    (the training fold, i.e. all the rest of the data).
    """
    items = data_set.get_items()
    folds = []
    if items is not None:
        for i in range(num_fold):
            # Each fold is a tuple of datasets: the first is the test fold,
            # the inverse fold is used for training.
            folds.append((DataSet(), DataSet()))
        current_fold = 0
        for item in items:
            folds[current_fold][0].add_item(item, data_set.get_features(item),
                                            data_set.get_label(item))
            for j in range(num_fold):
                if j != current_fold:
                    folds[j][1].add_item(item, data_set.get_features(item),
                                         data_set.get_label(item))
            current_fold += 1
            if current_fold == num_fold:
                current_fold = 0
    return folds
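# generate_folds above assigns items to test folds round-robin, so fold sizes can
# differ by at most one item. A tiny stand-alone check of that property (it does
# not use the DataSet class):
num_items, num_fold = 10, 3
fold_sizes = [len(range(k, num_items, num_fold)) for k in range(num_fold)]
assert fold_sizes == [4, 3, 3]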
def main(args):
    # argument handling here could be replaced with argparse
    steps_per_epoch = _DATASET_SIZE / args.batch_size
    mfcc_data = DataSet(args.mfcc_dir)
    label_data = DataSet(args.label_dir)
    _, coefficient_vector_size, num_of_window = mfcc_data.shape()
    _, degree_of_latent_factor = label_data.shape()
    use_channel = False
    if args.model == 'conv1d':
        model = conv1d(num_of_window, coefficient_vector_size, degree_of_latent_factor)
    elif args.model == 'conv2d':
        use_channel = True
        model = conv2d(num_of_window, coefficient_vector_size, degree_of_latent_factor)
    elif args.model == 'conv1d_lstm':
        model = conv_lstm(num_of_window, coefficient_vector_size, degree_of_latent_factor)
    else:
        model = feed_forward(num_of_window, coefficient_vector_size, degree_of_latent_factor)
    trained_model, history = train(model, mfcc_data, label_data, use_channel,
                                   args.test_ratio, args.batch_size, steps_per_epoch, args.epochs)
    export(args.result, trained_model, history)
def main():
    algorithms = [
        factory.get_algorithm("id3"),
        factory.get_algorithm("knn"),
        factory.get_algorithm("bayes")
    ]
    training = DataSet()
    target = training.load_from_file("train.txt")
    validation = DataSet()
    validation.load_from_file("test.txt")
    output_file = "output.txt"

    # run the algorithms
    acc = []
    for i in range(len(algorithms)):
        accuracy = tests.validate(algorithms[i], training, validation, target)
        # print the tree for the ID3 algorithm specifically
        if i == 0:
            algorithms[i].print_tree(output_file)
        acc.append(str(accuracy))

    # append the accuracies to the output file
    accuracy_string = '\n' + '\t'.join(acc)
    with open(output_file, 'a') as acc_file:
        acc_file.write(accuracy_string)
def create_pointer_examples():
    results = []
    result_names = []

    # pure BPEmb
    vs = 100000
    d = 200
    bp_man = BPEmbeddings(bp_vocab_size=vs, dim=d, case_sensitive=False)
    ds = DataSet("blah")
    ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
    bp_man.build_vocabulary([ds])
    manager = PointerManager(bp_man, "basic", learning_rate=START_LR, lr_factor=LR_DECAY,
                             lr_patience=LR_PATIENCE, cuda_device=CUDA_DEVICE)
    manager.load_model("pointer/models/19_05_11b/bpemb_{}_{}.pt".format(vs, d))
    results.append(test_example(manager))
    result_names.append("bpemb_{}_{}".format(vs, d))

    # pure GloVe
    for d in [50, 300]:
        path = "embeddings/glove/glove.6B.{}d.txt".format(d)
        g_man = GloveEmbeddings(path=path, dim=d)
        manager = PointerManager(g_man, "basic", learning_rate=START_LR, lr_factor=LR_DECAY,
                                 lr_patience=LR_PATIENCE, cuda_device=CUDA_DEVICE)
        manager.load_model("pointer/models/19_05_11b/glove_{}.pt".format(d))
        results.append(test_example(manager))
        result_names.append("glove_{}".format(d))

    # GloVe + BPEmb
    for g_d, b_d in [(200, 50), (300, 25)]:
        path = "embeddings/glove/glove.6B.{}d.txt".format(g_d)
        g_man = GloveEmbeddings(path=path, dim=g_d)
        b_man = BPEmbeddings(dim=b_d, bp_vocab_size=100000)
        c_man = CombinedEmbeddings([g_man, b_man])
        ds = DataSet("blah")
        ds.read_multiple(train_sets + dev_set + itac_test + conll_test)
        c_man.build_vocabulary([ds])
        manager = PointerManager(g_man, "basic", learning_rate=START_LR, lr_factor=LR_DECAY,
                                 lr_patience=LR_PATIENCE, cuda_device=CUDA_DEVICE)
        manager.load_model("pointer/models/19_05_11b/glove_d{}_bp_d{}.pt".format(g_d, b_d))
        results.append(test_example(manager))
        result_names.append("glove_d{}_bp_d{}_vs100000".format(g_d, b_d))

    write_results("results/19_05_11b/pointer_examples.txt", results=results, names=result_names)
def __init__(self, sess, epoch, batch_size, dataset_name, checkpoint_dir, result_dir, log_dir,
             learning_rate=0.00001, beta1=0.5):
    self.sess = sess
    self.dataset_name = dataset_name
    self.result_dir = result_dir
    self.log_dir = log_dir
    self.epoch = epoch
    self.batch_size = batch_size
    self.beta1 = beta1
    if dataset_name == 'BLSD':
        self.label_dim = 8
        self.train_set = DataSet("../dataset/BLSD/img", self.batch_size, self.label_dim)
        self.log_dir = log_dir + "/BLSD"
        self.checkpoint_dir = checkpoint_dir + "/BLSD"
        self.predict_set = DataSet("../predictset/BLSD", 1, self.label_dim)
        self.label_name = [
            "amusement", "anger", "awe", "contentment",
            "disgust", "excitement", "fear", "sadness"
        ]
        # self.pred_set = DataSet("../BLSD_predset/img", self.batch_size)
    elif dataset_name == 'kaggle':
        self.label_dim = 7
        self.train_set = DataSet("../dataset/kaggle/training", self.batch_size, self.label_dim)
        self.test_set = DataSet("../dataset/kaggle/test", 1, self.label_dim)
        self.log_dir = log_dir + "/kaggle"
        self.checkpoint_dir = checkpoint_dir + "/kaggle"
        self.predict_set = DataSet("../predictset/kaggle", 1, self.label_dim)
        self.label_name = [
            "anger", "disgust", "fear", "happy", "sad", "surprise", "neutral"
        ]

    # parameters
    self.input_height = 224
    self.input_width = 224
    self.output_height = 224
    self.output_width = 224
    self.c_dim = 3

    # train
    self.learning_rate = learning_rate

    # get number of batches for a single epoch
    self.num_batches = self.train_set.total_batches
    self.test_num_batches = self.test_set.total_batches
    self.predict_num_batches = self.predict_set.total_batches
def train_it():
    SAVE_PATH = '/mnt/md1/Experiments/SSAD_Test9'
    config = Config()
    ssad = SSAD(config).to(device)
    # optim = torch.optim.SGD(ssad.parameters(), lr=0.5, momentum=0.9, weight_decay=0.0001)
    optim = torch.optim.Adam(ssad.parameters(), lr=config.learning_rates[0], weight_decay=0.0001)
    # dataset_train = DataSet('training', True, 'HQZ_DPN107_RGB_FULL')
    # dataset_val = DataSet('validation', False, 'HQZ_DPN107_RGB_FULL')
    dataset_train = DataSet('training', False, 'MIX_RES200_DPN107')
    dataset_val = DataSet('validation', False, 'MIX_RES200_DPN107')
    TRAIN_ITER = len(dataset_train.vids) // config.batch_size + 1
    VAL_ITER = len(dataset_val.vids) // config.batch_size + 1
    for epoch in range(config.training_epochs):
        ssad.train()
        dataset_train.pemutate_vids()
        dataset_val.pemutate_vids()
        for idx in range(TRAIN_ITER):
            gF, gL, gB, gI = dataset_train.nextbatch(config.batch_size)
            gF = np.transpose(gF, (0, 2, 1))
            gF = torch.from_numpy(gF).to(device).float()
            gL = torch.from_numpy(gL).to(device).long()
            gB = torch.from_numpy(gB).to(device).float()
            ssad.zero_grad()
            train_loss, _, _ = SSAD_Train(ssad, gF, gL, gB, gI, config)
            train_loss.backward()
            optim.step()
            print('Train: {} {}/{} train_loss: {}'.format(epoch, idx, TRAIN_ITER, train_loss.item()), flush=True)
        if epoch % 2 == 0:
            ssad.eval()
            for idx in range(VAL_ITER):
                with torch.no_grad():
                    gF, gL, gB, gI = dataset_val.nextbatch(config.batch_size)
                    gF = np.transpose(gF, (0, 2, 1))
                    gF = torch.from_numpy(gF).to(device).float()
                    gL = torch.from_numpy(gL).to(device).long()
                    gB = torch.from_numpy(gB).to(device).float()
                    val_loss, _, _ = SSAD_Train(ssad, gF, gL, gB, gI, config)
                    print('Test: {} {}/{} test_loss: {}'.format(epoch, idx, VAL_ITER, val_loss.item()), flush=True)
        # save model
        save_modle(ssad, SAVE_PATH + '/ssad_resnet200_2048_{:03d}.pth'.format(epoch))
        # change learning rate
        change_optim_lr(optim, config.learning_rates[epoch])
def moving_extract(self, window=30, date=None, open_prices=None, close_prices=None,
                   high_prices=None, low_prices=None, volumes=None, N_predict=1, flatten=True):
    self.extract(open_prices=open_prices, close_prices=close_prices, high_prices=high_prices,
                 low_prices=low_prices, volumes=volumes)
    feature_arr = numpy.asarray(self.feature)
    p = 0
    rows = feature_arr.shape[0]
    print("feature dimension: %s" % rows)
    all_data = DataSet([], [], [])
    predict = DataSet([], [], [])
    while p + window <= feature_arr.shape[1]:
        # The last self.prospective days cannot produce complete labels.
        if feature_arr.shape[1] - (p + window) >= N_predict:
            x = feature_arr[:, p:p + window]
            # Label: the closing prices of the next self.prospective days.
            y = make_label(close_prices, p + window, self.prospective)
            d = list(date[p + window:p + window + self.prospective])
            if flatten:
                x = x.flatten("F")
            all_data.features.append(numpy.nan_to_num(x))
            all_data.labels.append(y)
            all_data.date.append(d)
        else:
            x = feature_arr[:, p:p + window]
            if flatten:
                x = x.flatten("F")
            predict.features.append(numpy.nan_to_num(x))
            predict.date.append(date[p + window - 1])
            predict.closing_price.append(close_prices[p + window - 1])
            predict.last_label.append(close_prices[p + window - 2])
        p += 1
    all_data._features = numpy.asarray(all_data.features)
    all_data._labels = numpy.asarray(all_data.labels)
    all_data._date = numpy.asarray(all_data.date)
    predict._features = numpy.asarray(predict.features)
    predict._date = numpy.asarray(predict.date)
    predict._last_label = numpy.asarray(predict.last_label)
    predict._closing_price = numpy.asarray(predict.closing_price)
    return all_data, predict
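# A toy illustration of the sliding-window layout produced by moving_extract above
# (independent of the DataSet class). With a 5-step price series, window=3 and
# N_predict=1, the windows starting at p=0 and p=1 receive labels, while the final
# window only has enough history for the prediction set.
import numpy as np

series = np.arange(5.0)
window, n_predict = 3, 1
labelled = [(series[p:p + window], series[p + window:p + window + n_predict])
            for p in range(len(series) - window - n_predict + 1)]
assert len(labelled) == 2 and labelled[0][1][0] == 3.0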
def import_mnist():
    """
    Import MNIST and save the data as an object of our DataSet class.
    :return:
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    VALIDATION_SIZE = 0
    ONE_HOT = True
    TRAIN_DIR = 'MNIST_data'

    local_file = base.maybe_download(TRAIN_IMAGES, TRAIN_DIR, SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(open(local_file))
    local_file = base.maybe_download(TRAIN_LABELS, TRAIN_DIR, SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(open(local_file), one_hot=ONE_HOT)
    local_file = base.maybe_download(TEST_IMAGES, TRAIN_DIR, SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(open(local_file))
    local_file = base.maybe_download(TEST_LABELS, TRAIN_DIR, SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(open(local_file), one_hot=ONE_HOT)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    ## Process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    ## Standardize data
    train_mean, train_std = get_data_info(train_images)
    # train_images = standardize_data(train_images, train_mean, train_std)
    # validation_images = standardize_data(validation_images, train_mean, train_std)
    # test_images = standardize_data(test_images, train_mean, train_std)

    # data = DataSet(train_images, train_labels)
    # test = DataSet(test_images, test_labels)
    # val = DataSet(validation_images, validation_labels)
    data = DataSet(train_images, train_images)
    test = DataSet(test_images, test_images)
    val = DataSet(validation_images, validation_images)
    return data, test, val
def test():
    with open('unrelated_vs_all.pkl', 'rb') as infile:
        unrelated_vs_all = pickle.load(infile)
    with open('disagree_vs_all.pkl', 'rb') as infile:
        disagree_vs_all = pickle.load(infile)
    with open('agree_vs_all.pkl', 'rb') as infile:
        agree_vs_all = pickle.load(infile)

    # create the test set with lemmatized bodies
    test_set = DataSet("csv/test_stances_csc483583.csv", "csv/lemmatized_bodies.csv")
    # create an original set that has the original bodies
    orig_set = DataSet("csv/test_stances_csc483583.csv", "csv/train_bodies.csv")
    stances = test_set.stances
    articles = test_set.articles
    orig_articles = orig_set.articles
    gold = []
    count = 0
    for stance in stances:
        stance_result = ""
        headline = stance['Headline']
        bodyID = stance['Body ID']
        # get the lemmatized body from the DataSet created with lemmatized_bodies.csv
        body_lemmas = articles[bodyID]
        # get the original body from the DataSet created with train_bodies.csv
        orig_body = orig_articles[bodyID]
        count += 1
        print("classifying article id: " + str(bodyID))
        print("article count: " + str(count))
        similarity_score, similar_sentences, max_similarity, negation_average = \
            similarity_feature(headline, body_lemmas, orig_body)
        neg = max_similarity.get('Negates')
        if neg is None:
            neg = 0
        max_score = max_similarity.get('Score')
        if max_score is None:
            max_score = 0.0
        # predict stance_result using the SVMs
        unrelated_vs_all_result = unrelated_vs_all.predict([[similarity_score, max_score]])
        disagree_vs_all_result = disagree_vs_all.predict([[negation_average]])
        agree_vs_all_result = agree_vs_all.predict([[similarity_score, max_score]])
        if unrelated_vs_all_result == 1:
            stance_result = 'unrelated'
        elif disagree_vs_all_result == 1:
            stance_result = 'disagree'
        elif agree_vs_all_result == 1:
            stance_result = 'agree'
        else:
            stance_result = 'discuss'
        gold.append({'Headline': headline, 'Body ID': bodyID, 'Stance': stance_result})

    keys = gold[0].keys()
    with open('csv/gold.csv', 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(gold)
def init_model(self):
    print("initializing network\n")
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0: a batch of [30, ...] is split into [10, ...], [10, ...], [10, ...] across 3 GPUs
        self.model = createDeepLabv3()
        self.model = nn.DataParallel(self.model, device_ids=self.device_ids).to(self.device)
    else:
        self.model = createDeepLabv3().to(self.device)
    # self.optim = torch.optim.Adam(self.model.parameters(), lr=self.lr, betas=(self.beta_1, self.beta_2))
    self.optim = torch.optim.SGD(self.model.parameters(), lr=self.lr)
    self.criterian = torch.nn.MSELoss(reduction='mean')
    self.transform = transforms.Compose([
        # transforms.RandomResizedCrop(128, scale=(0.08, 1.0), ratio=(0.75, 1.3333333333333333), interpolation=2),
        # transforms.RandomRotation((-90, 90)),
        # transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=0),
        # transforms.RandomHorizontalFlip(p=0.8),
        # transforms.RandomVerticalFlip(p=0.8),
        # transforms.RandomAffine((-5, 5)),
        # transforms.GaussianBlur(kernel_size, sigma=(0.1, 2.0)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    # build the datasets
    # per-channel (RGB) mean and standard deviation
    color_mean = (0.485, 0.456, 0.406)
    color_std = (0.229, 0.224, 0.225)
    self.train_ = DataSet(img_dir=self.img_dir, mask_dir=self.mask_dir, size=self.im_size, data_type="train")
    # transform=DataTransform(input_size=1024, color_mean=color_mean, color_std=color_std))
    self.valid_ = DataSet(img_dir=self.img_dir, mask_dir=self.mask_dir, size=self.im_size, data_type="validation")
    # transform=DataTransform(input_size=1024, color_mean=color_mean, color_std=color_std))
    self.dataloader_train = DataLoader(self.train_, batch_size=self.batch_size, num_workers=4, shuffle=True)
    self.dataloader_valid = DataLoader(self.valid_, batch_size=self.batch_size, num_workers=4, shuffle=False)
    print("initialization done\n")
def import_dataset(dataset, fold):
    train_X = np.loadtxt('FOLDS/' + dataset + '_ARD_Xtrain__FOLD_' + fold, delimiter=' ')
    train_Y = np.loadtxt('FOLDS/' + dataset + '_ARD_ytrain__FOLD_' + fold, delimiter=' ')
    test_X = np.loadtxt('FOLDS/' + dataset + '_ARD_Xtest__FOLD_' + fold, delimiter=' ')
    test_Y = np.loadtxt('FOLDS/' + dataset + '_ARD_ytest__FOLD_' + fold, delimiter=' ')
    data = DataSet(train_X, train_Y)
    test = DataSet(test_X, test_Y)
    return data, test
def basic_eg1k_checkup():
    dss = []
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_rnd_std'))
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_rnd_kcv'))
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_chr_frm'))
    dss.append(DataSet('../datasets/', 'gr-qc', 'eg1k_chr_prc'))
    for ds in dss:
        check_trn_tst_disjoint(ds)
        check_trn_symmetric_and_connected(ds)
def read_feature(path, input_shape, prefix):
    ultimate_features = numpy.loadtxt("%s/%s_feature.%s" % (path, prefix, str(input_shape[0])))
    ultimate_features = numpy.reshape(ultimate_features, [-1, input_shape[0], input_shape[1]])
    ultimate_labels = numpy.loadtxt("%s/%s_label.%s" % (path, prefix, str(input_shape[0])))
    # ultimate_labels = numpy.reshape(ultimate_labels, [-1, 1])
    train_set = DataSet(ultimate_features, ultimate_labels)

    test_features = numpy.loadtxt("%s/%s_feature.test.%s" % (path, prefix, str(input_shape[0])))
    test_features = numpy.reshape(test_features, [-1, input_shape[0], input_shape[1]])
    test_labels = numpy.loadtxt("%s/%s_label.test.%s" % (path, prefix, str(input_shape[0])))
    # test_labels = numpy.reshape(test_labels, [-1, 1])
    test_set = DataSet(test_features, test_labels)
    return train_set, test_set
def train(): #训练数据 data_train, label_train = DataSet.data_from_text("./Hnd/trainyny.txt", 1450) train = DataSet(data_train, label_train, dtype=dtypes.float32) data_test, label_test = DataSet.data_from_text("./Hnd/testyny.txt", 145) test = DataSet(data_test, label_test, dtype=dtypes.float32) DataSetsx = collections.namedtuple('DataSetsx', ['train', 'test']) Data = DataSetsx(train=train, test=test) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver({ 'cnn_w1': W_conv1, 'cnn_w2': W_conv2, 'cnn_w3': W_fc1, 'cnn_w4': W_fc2, 'cnn_b1': b_conv1, 'cnn_b2': b_conv2, 'cnn_b3': b_fc1, 'cnn_b4': b_fc2 }) for i in range(50000): #训练阶段,迭代50000次 batch_X, batch_Y = Data.train.next_batch(100) sess.run(train_step, feed_dict={ xs: batch_X, ys: batch_Y, keep_prob: 0.5 }) accu = 0 if (i % 100 == 0): #每训练100次,测试一次 v_xs = Data.test.images v_ys = Data.test.labels y_pre = sess.run(prediction, feed_dict={ xs: v_xs, keep_prob: 1 }) correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(v_ys, 1)) accuracy = tf.reduce_mean( tf.cast(correct_prediction, tf.float32)) accu = sess.run(accuracy, feed_dict={ xs: v_xs, ys: v_ys, keep_prob: 1 }) print("accuracy:", accu) if (accu > Target_Accuracy): break saver.save(sess, "./model/cnn.ckpt")
def build(self):
    cfg = utils.load_config()
    if os.path.exists(f"{cfg['user']}.csv"):
        print('Existing csv found, loading the file')
        dataset = DataSet(cfg, create_csv=False)
    else:
        print('No csv found, creating one using segmented data')
        extract.extract_all(data_path=cfg['data_path'], segments_path=cfg['segments_path'])
        dataset = DataSet(cfg, create_csv=True)
    return VideoWidget(dataset, cfg)
def read_ultimate(path, input_shape):
    ultimate_features = numpy.loadtxt(path + "ultimate_feature." + str(input_shape[0]))
    ultimate_features = numpy.reshape(ultimate_features, [-1, input_shape[0], input_shape[1]])
    ultimate_labels = numpy.loadtxt(path + "ultimate_label." + str(input_shape[0]))
    # ultimate_labels = numpy.reshape(ultimate_labels, [-1, 1])
    train_set = DataSet(ultimate_features, ultimate_labels)

    test_features = numpy.loadtxt(path + "ultimate_feature.test." + str(input_shape[0]))
    test_features = numpy.reshape(test_features, [-1, input_shape[0], input_shape[1]])
    test_labels = numpy.loadtxt(path + "ultimate_label.test." + str(input_shape[0]))
    # test_labels = numpy.reshape(test_labels, [-1, 1])
    test_set = DataSet(test_features, test_labels)
    return train_set, test_set
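# Sketch of the reshape used by read_ultimate / read_feature above: the flat 2-D
# array returned by numpy.loadtxt is folded back into windows of shape
# (input_shape[0], input_shape[1]). The toy sizes below are illustrative only.
import numpy as np

flat = np.arange(12.0).reshape(6, 2)    # e.g. 6 rows of 2 values each
windows = np.reshape(flat, [-1, 3, 2])  # -> 2 windows of 3 timesteps x 2 features
assert windows.shape == (2, 3, 2)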
def import_dataset(dataset, k_fold):
    path_train_1 = os.path.join(path_hdf5, 'fold_0.hdf')
    hf_0 = h5py.File(path_train_1, 'r')
    train_X = loading_data(hf_0)
    test_X = train_X
    hf_0.close()
    data = DataSet(train_X, train_X)
    test = DataSet(test_X, test_X)
    return data, test
def read_dataset(folder_name, debug=False):
    f = gzip.open(folder_name, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    n_samples = train_set[0].shape[0]
    if debug:
        n_samples = 10000
    datasets_template = collections.namedtuple('Datasets_template', ['train', 'validation', 'test'])
    Datasets = datasets_template(train=DataSet(train_set[0][:n_samples, :], train_set[1]),
                                 validation=DataSet(valid_set[0], valid_set[1]),
                                 test=DataSet(test_set[0], test_set[1]))
    return Datasets
def test_constructor_reads_from_file_and_concats(self):
    one_result: pd.DataFrame = DataSet(["./fixtures/nine_records.csv"],
                                       PreprocessorSpy(), TrainerStub())._df
    assert_that(one_result).is_not_none()
    assert_that(len(one_result)).is_equal_to(9)

    two_results: pd.DataFrame = \
        DataSet(["./fixtures/nine_records.csv", "./fixtures/four_records.csv"],
                PreprocessorSpy(), TrainerStub())._df
    assert_that(two_results).is_not_none()
    assert_that(len(two_results)).is_equal_to(13)