def createIntegrator(self):
    vx = Dataset('VX')
    vy = Dataset('VY')
    self.velocityIntegrator = FlowIntegrator(vx, vy)

    bx = Dataset('surfaceGradX')
    by = Dataset('surfaceGradY')
    self.surfaceIntegrator = FlowIntegrator(bx, by)
def get_iterator(mode):
    normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                     std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    kwargs = {'num_workers': 4, 'pin_memory': True}
    transform_augment = transforms.Compose([
        # transforms.RandomResizedCrop(args.size, scale=(0.8, 1.2)),  # random scale 0.8-1 of original image area, crop to args.size
        transforms.RandomResizedCrop(size),
        transforms.RandomRotation(15),  # random rotation -15 to +15 degrees
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform = transforms.Compose([
        transforms.Resize((size, size)),
        transforms.ToTensor(),
        normalize,
    ])
    if mode:
        dataset = Dataset.MURA(split="train",
                               transform=(transform_augment if augment else transform),
                               type=type)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=True, **kwargs)
    else:
        dataset = Dataset.MURA(split="test", transform=transform, type=type)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             shuffle=False, **kwargs)
    return loader
def main():
    file = IOHelper().checkArg(sys.argv)
    if (len(file) < 1):
        print("Missing file")
        exit(1)
    d = Dataset()
    d.loadFile(file[0])

    fig, axes = plt.subplots(figsize=(18, 10))
    fig.tight_layout()

    start = 6
    width = 13
    widthStart = 0
    widthEnd = widthStart + width
    ystart = start
    for i in range(width):
        drawOneSub(d, start, ystart, range(widthStart, widthEnd))
        widthStart += width
        widthEnd += width
        start += 1
        print("")

    # plt.title(d.getName(index))
    plt.savefig('scatter_plot.png')
    plt.show()
def loadDataset():
    dataFile = pd.read_csv("data/full_context_PeerRead.csv")
    column = ['left_citated_text', 'right_citated_text', 'target_id',
              'source_id', 'target_year', 'target_author']
    df = dataFile[column]

    df = cut_off_dataset(df, config.FREQUENCY)
    df = slicing_citation_text(df, config.SEQ_LENGTH)

    trainDF, testDF = split_dataset(df, config.YEAR)
    trainDF, testDF, labelGenerator = get_label(df, trainDF, testDF)

    trainDF = trainDF.reset_index(drop=True)
    testDF = testDF.reset_index(drop=True)

    trainDataset = Dataset.BertBaseDataset(
        contextLeft=trainDF["leftSTRING"].values,
        targetIndex=trainDF["LabelIndex"].values,
        contextRight=trainDF["rightSTRING"].values,
        isRight=config.isRight
    )

    testDataset = Dataset.BertBaseDataset(
        contextLeft=testDF["leftSTRING"].values,
        targetIndex=testDF["LabelIndex"].values,
        contextRight=testDF["rightSTRING"].values,
        isRight=config.isRight
    )

    return trainDataset, testDataset, labelGenerator
def __init__(self, **kwargs):
    params = set([
        'learning_rate', 'max_epochs', 'display_step', 'std_dev',
        'dataset_train', 'dataset_valid', 'dataset_test'
    ])
    # initialize all allowed keys to false
    self.__dict__.update((key, False) for key in params)
    # and update the given keys by their given values
    self.__dict__.update((key, value) for key, value in kwargs.items()
                         if key in params)

    if (self.dataset_train != False and self.dataset_valid != False):
        # Load the Training Set
        self.train_imgs_lab = Dataset.loadDataset(self.dataset_train)
        self.valid_imgs_lab = Dataset.loadDataset(self.dataset_valid)
    else:
        # Load the Test Set
        self.test_imgs_lab = Dataset.loadDataset(self.dataset_test)

    # Graph input
    self.img_pl = tf.placeholder(
        tf.float32, [None, RE_IMG_SIZE, RE_IMG_SIZE, n_channels])
    self.label_pl = tf.placeholder(tf.float32, [None, Dataset.NUM_LABELS])
    self.keep_prob = tf.placeholder(tf.float32)  # dropout (keep probability)
def main():
    train_list = Dataset.make_datapath_list._make_datapath_list("tranings")
    size = 28

    # create the DataLoaders
    train_dataset = Dataset.MyDataset(file_list=train_list,
                                      transform=Dataset.ImageTransform(size),
                                      phase='train')
    test_dataset = Dataset.testDataset(transform=Dataset.ImageTransform(size),
                                       phase='val')
    train_dataloder = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_dataloder = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)

    # collect the loaders in a dict
    dataloders_dict = {"train": train_dataloder, "val": test_dataloder}

    # set the loss function
    criterion = nn.CrossEntropyLoss()

    model = Network.Net(10)
    model = model.to("cuda")
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    val_loss_list, train_loss_list, val_acc_list, train_acc_list = train_model(
        model, dataloders_dict, criterion, optimizer, num_epochs=30)

    torch.save(model.state_dict(), "mnist_cnn.pth")
def setDataset(self):
    try:
        self.dataset = Dataset(
            raw_input("Enter the name of the dataset to import: "))
    except:
        print "Error loading dataset. Try again!"
        self.setDataset()
def main():
    ########### read the config file ##########
    ch = config.ConfigHandler("./config.ini")
    ch.load_config()

    ########### read the parameters ##########
    train_batch_size = int(ch.config["model"]["train_batch_size"])
    test_batch_size = int(ch.config["model"]["test_batch_size"])
    num_epochs = int(ch.config["model"]["num_epochs"])
    learning_rate = float(ch.config["model"]["learning_rate"])
    class_size = int(ch.config["model"]["class_size"])

    ########### read the log and model settings ##########
    log_interval = int(ch.config["log"]["log_interval"])
    version_name = ch.config["log"]["version_name"]
    train_file = ch.config["data"]["train_file"]
    test_file = ch.config["data"]["test_file"]

    ########### build the training data loader ##########
    data_train = Dataset.ImageDataset(train_file, train=True)
    data_loader_train = torch.utils.data.DataLoader(
        dataset=data_train, batch_size=train_batch_size, shuffle=True)

    ########### build the test data loader ##########
    data_test = Dataset.ImageDataset(test_file, train=False)
    data_loader_test = torch.utils.data.DataLoader(dataset=data_test,
                                                   batch_size=test_batch_size,
                                                   shuffle=False)

    ########### train and evaluate ##########
    train.train_and_test(num_epochs, learning_rate, class_size,
                         data_loader_train, data_loader_test, log_interval,
                         version_name).train_epoch()
def TransferedLearning():
    start_time = time.time()

    model = Models.GetTrainedWithImageNetByName(GlobalVariables.MODEL_TO_USE)
    model.summary()
    model.compile(optimizer='adam',
                  loss=keras.losses.mean_squared_error,
                  metrics=['accuracy'])

    image_height, image_length, color_depth = Models.Shapes[GlobalVariables.MODEL_TO_USE]

    history = model.fit(
        Dataset.getTrainingDatasetGenerator(image_height, image_length,
                                            color_depth == 1),
        epochs=GlobalVariables.EPOCHS,
        validation_data=Dataset.getTestingDatasetGenerator(
            image_height, image_length, color_depth == 1),
        callbacks=[
            keras.callbacks.ModelCheckpoint(
                "D:/Trained_models/{epoch:02d}e-{accuracy:.4f}-{val_accuracy:.4f}.h5",
                monitor='val_accuracy',
                save_best_only=True,
                verbose=0),
            keras.callbacks.CSVLogger('Training_histories/Test.csv',
                                      append=True,
                                      separator=';')
        ])

    print("Training time in sec:", time.time() - start_time)
    Display.TrainingHistory(history)
def calcStochasticGradient():
    r = 0.1
    w = [0.0, 0.0, 0.0, 0.0]
    weights = []
    weights.append([0.0, 0.0, 0.0, 0.0])
    newWeight = [0.0, 0.0, 0.0, 0.0]
    gradients = []
    trainingData = Dataset('pr2Training.csv')

    # Compute gradient
    for example in trainingData.getExampleList():
        gradient = []
        lastWeight = copy.deepcopy(newWeight)
        yi, xi = getYiXi(example)
        for index in range(0, len(w)):
            xij = xi[index]
            gradient.append(
                ((yi - np.dot(np.transpose(w), np.array(xi))) * xij))
            newWeight[index] = lastWeight[index] + r * gradient[index]
        gradients.append(gradient)
        weights.append(copy.deepcopy(newWeight))

    print "weights: "
    for weight in weights:
        print str(weight)
    print "gradient: "
    for gradient in gradients:
        print str(gradient)
def __init__(self, data, labels, *args, **kwargs):
    super(Player, self).__init__(*args, **kwargs)
    self.data = Dataset(data)
    self.file_list = []
    for f in self.data.get_files():
        self.file_list.append(f.split("/")[-1])
    self.labelset = LabelSet(self.file_list, labels[0], labels[1])

    self.label_colors = {}
    self.label_colors[''] = self.palette().color(QPalette.Background)
    for i in labels[0]:
        self.label_colors[i] = "blue"
    for i in labels[1]:
        self.label_colors[i] = "red"

    self.setWindowTitle("test")
    self.status = {"playing": False}
    self.image_frame = bboxCanvas(848, 480)

    self.video_timer = QTimer()
    self.video_timer.timeout.connect(self.next_frame)
    self.video_timer.setInterval(30)

    self.createLabelBar()
    self.createVideoBar()
    self.makeDock()

    label_layout = QHBoxLayout()
    self.label_list_widget = QListWidget()
    self.label_list_widget.setFlow(QListView.LeftToRight)
    self.label_list_widget.setMaximumHeight(30)
    self.activeBox = -1

    lCycleBtn = QPushButton("<<")
    lCycleBtn.clicked.connect(self.boxCycleDown)
    rCycleBtn = QPushButton(">>")
    rCycleBtn.clicked.connect(self.boxCycleUp)
    remBtn = QPushButton("remove")
    remBtn.clicked.connect(self.remActiveBox)

    label_layout.addWidget(self.label_list_widget)
    label_layout.addWidget(lCycleBtn)
    label_layout.addWidget(rCycleBtn)
    label_layout.addWidget(remBtn)

    self.image_frame.new_box_signal.connect(self.mark_box)

    mainWidget = QWidget()
    layout = QVBoxLayout()
    layout.addWidget(self.labelBar)
    layout.addLayout(label_layout)
    layout.addWidget(self.image_frame)
    layout.addWidget(self.videoBar)
    mainWidget.setLayout(layout)
    self.setCentralWidget(mainWidget)

    self.label_range = {"start": [0, 0], "end": [0, 0]}

    self.render_frame()
    self.fillLabels()
def __init__(self):
    self.ds = Dataset()
    self.allMethods = self.ds.getAllMethod()
    contents = []
    for method in self.allMethods:
        contents.append(method.content)
    self.tfidf = TFIDFAlg(contents)
    self.calcuatedsimi = {}
def main():
    ########### read the config file ##########
    ch = config.ConfigHandler("./config.ini")
    ch.load_config()

    ########### read the parameters ##########
    train_batch_size = int(ch.config["model"]["train_batch_size"])
    valid_batch_size = int(ch.config["model"]["valid_batch_size"])
    test_batch_size = int(ch.config["model"]["test_batch_size"])
    num_epochs = int(ch.config["model"]["num_epochs"])
    learning_rate = float(ch.config["model"]["learning_rate"])
    class_size = int(ch.config["model"]["class_size"])

    ########### read the log and model settings ##########
    log_interval = int(ch.config["log"]["log_interval"])
    version_name = ch.config["log"]["version_name"]
    train_file = ch.config["data"]["train_file"]
    valid_file = ch.config["data"]["valid_file"]
    test_file = ch.config["data"]["test_file"]

    ########### prediction output file ##########
    pred_file = ch.config["save"]["pred_file"]

    ########### build the training data loader ##########
    data_train = Dataset.ImageDataset(train_file, train=True)
    data_loader_train = torch.utils.data.DataLoader(
        dataset=data_train, batch_size=train_batch_size, shuffle=True)

    ########### build the validation data loader ##########
    data_valid = Dataset.ImageDataset(valid_file, train=False)
    data_loader_valid = torch.utils.data.DataLoader(
        dataset=data_valid, batch_size=valid_batch_size, shuffle=True)

    ########### build the test data loader ##########
    data_test = Dataset.ImageDataset(test_file, train=False)
    data_loader_test = torch.utils.data.DataLoader(dataset=data_test,
                                                   batch_size=test_batch_size,
                                                   shuffle=False)

    ########### train and evaluate ##########
    trainer = train.train_and_test(num_epochs, learning_rate, class_size,
                                   data_loader_train, data_loader_valid,
                                   data_loader_test, log_interval,
                                   version_name, pred_file)

    ########## start training ###########
    print("start train")
    begin_time = time()
    trainer.train_epoch()
    end_time = time()
    run_time = end_time - begin_time
    print('cost time:', run_time)

    ########## start evaluation ###########
    print("start test")
    trainer.test()
def calcFlux(self):
    self.vxInterp = Dataset('VX')
    self.vyInterp = Dataset('VY')

    trueFlux = self.calcTrueFlux()
    midFlux = self.calcMidFlux()
    sampleFlux = self.calcSampleFlux()
    naiveFlux = self.calcNaiveFlux()
def main(argv=None):
    voc = Wordlist('./data/wordlist.txt')

    trainset = Dataset('./data/train.txt', voc, BATCH_SIZE)
    devset = Dataset('./data/train.txt', voc, BATCH_SIZE)
    trainset_label = Label('./data/train_label.txt')
    devset_label = Label('./data/train_label.txt')
    print "data loaded!"

    train(trainset, devset, trainset_label, devset_label, voc)
def save_net_results(model_name, dataset_name):
    if dataset_name == 'mnist':
        train_images, train_labels, test_images, test_labels = datasets.mnist()
    elif dataset_name == 'shapes':
        train_images, train_labels, test_images, test_labels = datasets.shapes('img/shapes')
    elif dataset_name == 'alienator':
        train_images, train_labels, test_images, test_labels = datasets.alienator(
            'img', 'train_keypoints.txt', 'test_keypoints.txt', rotated=False)
    elif dataset_name == 'alienator_custom':
        train_images, train_labels, test_images, test_labels = datasets.alienator(
            '.', 'train_keypoints_custom.txt', 'test_keypoints_custom.txt',
            rotated=False, kp_size_multiplier=30)
    elif dataset_name == 'alienator_custom_ns':
        train_images, train_labels, test_images, test_labels = datasets.alienator(
            '.', 'train_keypoints_custom_ns.txt', 'test_keypoints_custom_ns.txt',
            rotated=False, kp_size_multiplier=30)
    elif dataset_name == 'alienator2':
        train_images, train_labels, test_images, test_labels = datasets.alienator(
            'img', 'train_keypoints2.txt', 'test_keypoints2.txt', rotated=False)
    else:
        train_images, train_labels, test_images, test_labels = datasets.brown(dataset_name)

    train_dataset = Dataset(train_images, train_labels, size=64)
    test_dataset = Dataset(test_images, test_labels,
                           mean=train_dataset.mean, std=train_dataset.std, size=64)

    network_desc = NetworkDesc(model_file=model_name + '.h5')
    # network_desc = NetworkDescPN(model_file=model_name + '.h5')

    batch = test_dataset.get_batch_triplets(100000)
    positives_net, negatives_net = get_positives_negatives(
        get_net_descriptors(network_desc, batch[0]),
        get_net_descriptors(network_desc, batch[1]),
        get_net_descriptors(network_desc, batch[2]))

    results_dir = 'results/{}/'.format(model_name)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)
    np.save('{}{}.positives'.format(results_dir, dataset_name), positives_net)
    np.save('{}{}.negatives'.format(results_dir, dataset_name), negatives_net)
def runOnServoDataset():
    data = Dataset(name="servo", directory="./datasets/")
    run(data, 'servo')

    if (False):
        X, Y = data.get_dataset()
        Y = np.array([Y])
        Y = Y.transpose()
        print("Shape X " + str(X.shape))
        print("Shape Y " + str(Y.shape))

        fun = RegressioneLineare(X, Y)

        pendec = PenaltyDecomposition(fun, x_0=np.array([X[0]]).transpose(),
                                      gamma=1.1, max_iterations=5,
                                      l0_constraint=15, tau_zero=1)
        pendec.start()

        inexact = InexactPenaltyDecomposition(fun, x_0=np.array([X[0]]).transpose(),
                                              gamma=1.1, max_iterations=5,
                                              l0_constraint=15, tau_zero=1)
        inexact.start()

        dfpd = DFPenaltyDecomposition(fun, x_0=np.array([X[0]]).transpose(),
                                      gamma=1, max_iterations=1,
                                      l0_constraint=15, tau_zero=2)
        dfpd = DFPenaltyDecomposition(fun, x_0=np.array([X[0]]).transpose(),
                                      gamma=1.1, max_iterations=3,
                                      l0_constraint=15, tau_zero=1)
        dfpd = DFPenaltyDecomposition(fun,
                                      x_0=np.array([np.ones(fun.number_of_x)]).transpose(),
                                      gamma=1.1, max_iterations=3,
                                      l0_constraint=15, tau_zero=1)
        dfpd = DFPenaltyDecomposition(fun, x_0=x0, gamma=1.1, max_iterations=1,
                                      l0_constraint=15, tau_zero=1)
        dfpd.start()
def __init__(self):
    self.d = Dataset()
    self.startMonth = self.d.monthNames[0][2:5] + " 20" + self.d.monthNames[0][5:]
    self.endMonth = (self.d.monthNames[len(self.d.monthNames) - 1][2:5] +
                     " 20" + self.d.monthNames[len(self.d.monthNames) - 1][5:])
    self.times = []
    for i in range(len(self.d.months)):
        self.times.append(i)
def main():
    # Limit GPU usage
    limit_gpu()

    # Project configurations
    config = Config.Config()

    # Convert numpy datasets to tfrecord datasets
    Dataset.convert_numpy_to_tfrecord(config, False)

    # Train model
    Train.train_model(config)
def Filter(nn, ds, predicate):
    ret = Dataset(ds.LabelCount())
    for i in range(ds.Count()):
        datum = ds.GetDatum(i)
        ground_label = ds.GetLabel(i)
        if predicate(nn, datum, ground_label):
            ret.Data.Add(Dataset.MemAccessor(datum))
            ret.Labels.Add(Dataset.MemAccessor(ground_label))
    return ret
class Split:
    """This class is the model of a split.

    While it is very similar to a DecisionTree, the main difference is that the
    split contains the real associated left and right datasets. Moreover, the
    Split class also contains a method that makes it easy to compute the gain
    of the split.
    @see DecisionTree.py
    """

    def __init__(self, is_numerical):
        self.is_numerical = is_numerical
        self.left = Dataset()
        self.right = Dataset()
        self.feature_index = None
        if is_numerical:
            self.feature_range = None
        else:
            self.feature_range = {}
        self.gain = -1

    def add_category_range(self, value):
        self.feature_range[value] = True

    def set_numerical_range(self, value):
        self.feature_range = float(value)

    def place(self, records, index):
        """Puts each record on the correct side, with respect to the feature
        at the given index. Also updates the gini values and the gain.
        """
        self.feature_index = index
        for r in records:
            if self.is_numerical and float(r.features[self.feature_index]) <= self.feature_range:
                side = self.left
            elif not self.is_numerical and r.features[self.feature_index] in self.feature_range:
                side = self.left
            else:
                side = self.right
            side.append(r)
        self.left.update()
        self.right.update()

        # compute gain
        self.left_gini = self.left.gini
        self.right_gini = self.right.gini
        l, r, n = self.left.size, self.right.size, float(records.size)
        self.gain = records.gini - (l / n) * self.left_gini - (r / n) * self.right_gini
def __init__(self, is_numerical):
    self.is_numerical = is_numerical
    self.left = Dataset()
    self.right = Dataset()
    self.feature_index = None
    if is_numerical:
        self.feature_range = None
    else:
        self.feature_range = {}
    self.gain = -1
def main():
    img_path = "lena.png"
    dataset_path = "/home/martin/datasets/flickr/thumbnails"
    dataset_csv = "dataset.csv"
    block_size = 64

    dataset = Dataset(dataset_path)
    #dataset.create()
    file_paths, rgb_means = dataset.load(dataset_csv)

    collage = Collage(img_path, file_paths, rgb_means, block_size)
    img_collage = collage.create()

    io.imsave("collage.png", img_collage)
def create_dataset_icdar2015(img_root, gt_root, output_path):
    im_list = os.listdir(img_root)
    im_path_list = []
    gt_list = []
    for im in im_list:
        name, _ = os.path.splitext(im)
        gt_name = 'gt_' + name + '.txt'
        gt_path = os.path.join(gt_root, gt_name)
        if not os.path.exists(gt_path):
            print('Ground truth file of image {0} does not exist.'.format(im))
        im_path_list.append(os.path.join(img_root, im))
        gt_list.append(gt_path)
    assert len(im_path_list) == len(gt_list)
    Dataset.create_dataset(output_path, im_path_list, gt_list)
def __init__(self, **kwargs):
    params = set(['learning_rate', 'max_epochs', 'display_step',
                  'dataset_training', 'dataset_test'])
    # initialize all allowed keys to false
    self.__dict__.update((key, False) for key in params)
    # and update the given keys by their given values
    self.__dict__.update((key, value) for key, value in kwargs.iteritems()
                         if key in params)

    if (self.dataset_training != False):
        self.train_imgs_lab = Dataset.loadDataset(self.dataset_training)
    else:
        self.test_imgs_lab = Dataset.loadDataset(self.dataset_test)

    # Store layers weight & bias
    self.weights = {
        'wc1': tf.Variable(tf.random_normal([11, 11, n_channels, BATCH_SIZE], stddev=std_dev)),
        'wc2': tf.Variable(tf.random_normal([5, 5, BATCH_SIZE, BATCH_SIZE*2], stddev=std_dev)),
        'wc3': tf.Variable(tf.random_normal([3, 3, BATCH_SIZE*2, BATCH_SIZE*4], stddev=std_dev)),
        'wc4': tf.Variable(tf.random_normal([3, 3, BATCH_SIZE*4, BATCH_SIZE*4], stddev=std_dev)),
        'wc5': tf.Variable(tf.random_normal([3, 3, BATCH_SIZE*4, 256], stddev=std_dev)),
        'wd': tf.Variable(tf.random_normal([1024, 4096])),
        'wfc': tf.Variable(tf.random_normal([4096, 1024], stddev=std_dev)),
        'out': tf.Variable(tf.random_normal([1024, n_classes], stddev=std_dev))
    }
    self.biases = {
        'bc1': tf.Variable(tf.random_normal([BATCH_SIZE])),
        'bc2': tf.Variable(tf.random_normal([BATCH_SIZE*2])),
        'bc3': tf.Variable(tf.random_normal([BATCH_SIZE*4])),
        'bc4': tf.Variable(tf.random_normal([BATCH_SIZE*4])),
        'bc5': tf.Variable(tf.random_normal([256])),
        'bd': tf.Variable(tf.random_normal([4096])),
        'bfc': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    # Graph input
    self.img_pl = tf.placeholder(tf.float32, [None, n_input, n_channels])
    self.label_pl = tf.placeholder(tf.float32, [None, n_classes])
    self.keep_prob_in = tf.placeholder(tf.float32)
    self.keep_prob_hid = tf.placeholder(tf.float32)

    # Create a saver for writing training checkpoints.
    self.saver = tf.train.Saver()
def create_specialized_csv(target_type, train_samples, test_samples,
                           keep_existing_cache, data_folder='./data',
                           use_complete_dataset=True):
    train_csv_file = data_folder + '/' + target_type + '_train_' + str(train_samples) + '.csv'
    test_csv_file = data_folder + '/' + target_type + '_test_' + str(test_samples) + '.csv'

    if keep_existing_cache and os.path.isfile(train_csv_file) and os.path.isfile(test_csv_file):
        # If csv files already exist, then keep them
        # TODO: Check content size of files to make sure we have the same amount of samples
        return train_csv_file, test_csv_file

    ds = pd.read_csv('stage_1_train_nice.csv')

    if test_samples <= 1:
        test_samples = math.floor(ds[ds[target_type] == 1].shape[0] * test_samples)
    if train_samples <= 1:
        if use_complete_dataset:
            # Use the complete dataset. Copy the dataset which is smaller
            if ds[ds[target_type] != 1].shape[0] > ds[ds[target_type] == 1].shape[0]:
                total_examples = ds[ds[target_type] != 1].shape[0]
            else:
                total_examples = ds[ds[target_type] == 1].shape[0]
            train_samples = math.floor(total_examples - test_samples)
        else:
            total_examples = ds[ds[target_type] == 1].shape[0]
            train_samples = math.floor(total_examples * train_samples)

    dataset = ds[ds[target_type] == 1].sample(test_samples)
    ds = ds.drop(dataset.index)
    none_ds = ds[ds[target_type] == 0].sample(test_samples)
    ds = ds.drop(none_ds.index)
    test_ds = pd.concat([dataset, none_ds]).sample(frac=1)
    test_ds.to_csv(test_csv_file, index=None, header=True)

    dataset = ds[ds[target_type] == 1].sample(train_samples, replace=True)
    ds = ds.drop(dataset.index)
    none_ds = ds[ds[target_type] == 0].sample(train_samples, replace=True)
    ds = ds.drop(none_ds.index)
    train_ds = pd.concat([dataset, none_ds]).sample(frac=1)
    train_ds.to_csv(train_csv_file, index=None, header=True)

    return train_csv_file, test_csv_file
def main():
    parser = argparse.ArgumentParser(
        description='A convolutional neural network for image recognition')
    subparsers = parser.add_subparsers()

    common_args = [
        (['-lr', '--learning-rate'], {'help': 'learning rate', 'type': float, 'default': 0.05}),
        (['-e', '--epochs'], {'help': 'epochs', 'type': int, 'default': 2}),
        (['-ds', '--display-step'], {'help': 'display step', 'type': int, 'default': 10}),
        (['-sd', '--std-dev'], {'help': 'std-dev', 'type': float, 'default': 0.1}),
        (['-d', '--dataset'], {'help': 'dataset file', 'type': str, 'default': 'test_dataset.p'})
    ]

    parser_train = subparsers.add_parser('train')
    parser_train.set_defaults(which='train')
    for arg in common_args:
        parser_train.add_argument(*arg[0], **arg[1])

    parser_preprocess = subparsers.add_parser('preprocessing')
    parser_preprocess.set_defaults(which='preprocessing')
    parser_preprocess.add_argument('-f', '--file', help='output file', type=str,
                                   default='images_dataset.p')
    parser_preprocess.add_argument('-s', '--shuffle', help='shuffle dataset',
                                   action='store_true')
    parser_preprocess.set_defaults(shuffle=False)

    parser_predict = subparsers.add_parser('predict')
    parser_predict.set_defaults(which='predict')
    for arg in common_args:
        parser_predict.add_argument(*arg[0], **arg[1])

    args = parser.parse_args()

    if args.which == 'train':
        log.basicConfig(filename='FileLog.log', level=log.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p', filemode="w")

    if args.which in ('train', 'predict'):
        # create the object ConvNet
        conv_net = ConvNet(args.learning_rate, args.epochs, args.display_step,
                           args.std_dev, args.dataset)
        if args.which == 'train':
            # TRAINING
            log.info('Start training')
            conv_net.training()
        else:
            # PREDICTION
            conv_net.prediction()
    elif args.which == 'preprocessing':
        # if args.shuffle:
        #     shuffle(args.file)
        # else:
        Dataset.saveDataset(IMAGE_DIR, args.file)
def train(model_name, restore=True):
    import_lib()
    global config, logger
    config = Config.config

    dataset = Dataset.Dataset()
    dataset.prepare_dataset()
    logger = utils.get_logger(model_name)

    model = PHVM.PHVM(len(dataset.vocab.id2featCate),
                      len(dataset.vocab.id2featVal),
                      len(dataset.vocab.id2word),
                      len(dataset.vocab.id2category),
                      key_wordvec=None,
                      val_wordvec=None,
                      tgt_wordvec=dataset.vocab.id2vec,
                      type_vocab_size=len(dataset.vocab.id2type))

    init = {'epoch': 0, 'worse_step': 0}
    if restore:
        init['epoch'], init['worse_step'], model = model_utils.restore_model(
            model,
            config.checkpoint_dir + "/" + model_name + config.tmp_model_dir,
            config.checkpoint_dir + "/" + model_name + config.best_model_dir)
    config.check_ckpt(model_name)

    summary = tf.summary.FileWriter(config.summary_dir, model.graph)
    _train(model_name, model, dataset, summary, init)
    logger.info("finish training {}".format(model_name))
def Information_gain(dataset, node):
    info_gain = None
    print "Information gain heuristic"
    entropy_set = Entropy_Set(dataset)
    neg_dict, pos_dict = Dataset.split_dataset(dataset, node)
    entropy_members = Entropy_Members(dataset, neg_dict, pos_dict, node)
    if entropy_set == 'F':
        info_gain = 0
        print "Info gain for all negative examples", info_gain
        return (info_gain, 'NA', {}, {})
    elif entropy_set == 'T':
        info_gain = 1
        print "Info gain for all positive examples", info_gain
        return (info_gain, 'NA', {}, {})
    else:
        info_gain = entropy_set - entropy_members
        info_gain = float(format(info_gain, ".4f"))
        print "Info gain for ", node, ": ", info_gain
        return (info_gain, node, neg_dict, pos_dict)
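# A minimal sketch of the binary entropy that Entropy_Set/Entropy_Members are
# presumed to compute above (the real Heuristics module is not shown here;
# `entropy` is a hypothetical helper implementing H = -p*log2(p) - q*log2(q)
# over the positive/negative class counts).
import math

def entropy(pos_count, neg_count):
    total = float(pos_count + neg_count)
    h = 0.0
    for count in (pos_count, neg_count):
        if count:
            p = count / total
            h -= p * math.log(p, 2)
    return h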
def main(argv=None):
    voc = Wordlist('./data/wordlist.txt')

    testset = Dataset('./data/train.txt', voc, BATCH_SIZE)
    testset_label = Label('./data/train_label.txt')
    print "data loaded!"

    evaluate(testset, testset_label, voc)
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    train_loss = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        #print(target.shape)
        coord, avg_values, A_spatial = dset.prepareGraph(data)
        #print(coord.shape, avg_values.shape, A_spatial.shape)
        data = [torch.from_numpy(np.concatenate((coord, avg_values), axis=2)).float().cuda(),
                torch.from_numpy(A_spatial).float().cuda(),
                False]
        #print(len(data))
        target = target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    train_loss = train_loss / (batch_idx + 1)
    torch.save(model.state_dict(), 'model_superpixel.pt')
    print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        train_loss, correct, len(train_loader.dataset),
        100. * correct / len(train_loader.dataset)))
    return train_loss, correct
def main():
    args = get_args()
    if args.train:
        train(args.model_name, args.restore)
    else:
        import_lib()
        dataset = Dataset.Dataset()
        model = PHVM.PHVM(len(dataset.vocab.id2featCate),
                          len(dataset.vocab.id2featVal),
                          len(dataset.vocab.id2word),
                          len(dataset.vocab.id2category),
                          key_wordvec=None,
                          val_wordvec=None,
                          tgt_wordvec=dataset.vocab.id2vec,
                          type_vocab_size=len(dataset.vocab.id2type))

        best_checkpoint_dir = config.checkpoint_dir + "/" + args.model_name + config.best_model_dir
        tmp_checkpoint_dir = config.checkpoint_dir + "/" + args.model_name + config.tmp_model_dir
        model_utils.restore_model(model, best_checkpoint_dir, tmp_checkpoint_dir)

        dataset.prepare_dataset()
        texts = infer(model, dataset, dataset.test)
        dump(texts, config.result_dir + "/{}.json".format(args.model_name))
        utils.print_out("finish file test")
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_size = 5

    # data loading
    test_dir = './data_5_5/test'
    test_dataset = Dataset.RadarGesture(test_dir)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)

    # model loading
    model = GestureNet().to(device)
    print(model)
    model.load_state_dict(torch.load('model.pth'))

    # test
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            print('Predicted:', predicted, 'Real:', labels)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print('Accuracy of the network on the {} test images: {} %'.format(
            len(test_loader) * batch_size, 100 * correct / total))
def to_dataset(out):
    ''' converts a list (sample, genes ...) into a dataset '''
    all_genes = []
    samples = []
    dicts = []

    # prepare lines
    for line in out:
        samples.append(line[0])
        all_genes += line[1:]
        dic = {}
        for x in line:
            dic[x] = 1
        dicts.append(dic)

    # select unique genes
    all_genes = list(set(all_genes))

    # prepare lines
    lines = []
    for i in range(len(samples)):
        line = [samples[i]]
        for gene in all_genes:
            try:
                line.append(dicts[i][gene])
            except KeyError:
                line.append(0)
        lines.append(line)

    data = Dataset.Dataset()
    data.create(all_genes, lines)
    return data
def main():
    parser = argparse.ArgumentParser(
        description='A convolutional neural network for image recognition')
    subparsers = parser.add_subparsers()

    common_args = [
        (['-lr', '--learning-rate'], {'help': 'learning rate', 'type': float, 'default': 0.1}),
        (['-e', '--epochs'], {'help': 'epochs', 'type': int, 'default': 5}),
        (['-ds', '--display-step'], {'help': 'display step', 'type': int, 'default': 10}),
        (['-sd', '--std-dev'], {'help': 'std-dev', 'type': float, 'default': 1.0}),
        (['-d', '--dataset'], {'help': 'dataset file', 'type': str, 'default': 'images_dataset.pkl'})
    ]

    parser_train = subparsers.add_parser('train')
    parser_train.set_defaults(which='train')
    for arg in common_args:
        parser_train.add_argument(*arg[0], **arg[1])

    parser_preprocess = subparsers.add_parser('preprocessing')
    parser_preprocess.set_defaults(which='preprocessing')
    parser_preprocess.add_argument('-f', '--file', help='output file', type=str,
                                   default='images_dataset.pkl')

    parser_predict = subparsers.add_parser('predict')
    parser_predict.set_defaults(which='predict')
    for arg in common_args:
        parser_predict.add_argument(*arg[0], **arg[1])

    args = parser.parse_args()

    log.basicConfig(filename='FileLog.log', level=log.INFO,
                    format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p', filemode="w")

    if args.which in ('train', 'predict'):
        t = timeit.timeit("Dataset.loadDataset(IMAGE_DIR)", setup="from __main__ import *")
        log.info("Execution time of Dataset.loadDataset(IMAGE_DIR) (__main__) = %.4f sec" % t)

        # create the object ConvNet
        conv_net = ConvNet(args.learning_rate, args.epochs, args.display_step,
                           args.std_dev, args.dataset)
        if args.which == 'train':
            # TRAINING
            conv_net.training()
        else:
            # PREDICTION
            conv_net.prediction()
    elif args.which == 'preprocessing':
        Dataset.saveDataset(IMAGE_DIR, args.file)
def main():
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    pC1 = getClassProb(Ytrain, -1)
    pC2 = getClassProb(Ytrain, 1)

    wordList = d.getWordList()
    w1 = [getFeatureProb(Xtrain, Ytrain, -1, wordIndex)
          for wordIndex in range(len(wordList))]
    aw1 = np.asarray(w1)
    w2 = [getFeatureProb(Xtrain, Ytrain, 1, wordIndex)
          for wordIndex in range(len(wordList))]
    aw2 = np.asarray(w2)

    trainError = computeError(Xtrain, Ytrain, pC1, pC2, aw1, aw2)
    print 'Train error rate is ' + str(trainError)
    testError = computeError(Xtest, Ytest, pC1, pC2, aw1, aw2)
    print 'Test error rate is ' + str(testError)
def main():
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=200)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100
    cols = []
    currentError = 1
    n = Xtrain.shape[1]
    dic = {}

    ## i is the number of features to be added to cols
    for i in range(40):
        bestJ = 0
        bestErrorRate = 1
        for j in range(n):
            cols.append(j)
            w = trainRidge(Xtrain[:, cols], Ytrain, lam)
            errorRate = computeError(Xtrain[:, cols], Ytrain, w)
            if errorRate < bestErrorRate:
                bestJ = j
                bestErrorRate = errorRate
                ## print 'Best error rate is ' + str(bestErrorRate)
            cols.pop()
        if bestErrorRate >= currentError:
            break
        else:
            cols.append(bestJ)
            dic[bestJ] = currentError - bestErrorRate
            currentError = bestErrorRate
            print 'Current error rate is ' + str(currentError)

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print 'Train error rate is ' + str(trainError)
    testError = computeError(Xtest[:, cols], Ytest, w)
    print 'Test error rate is ' + str(testError)

    ## find the top 10 features
    wordList = d.getWordList()
    topCols = [(key, value) for key, value in
               sorted(dic.iteritems(), key=lambda (k, v): (v, k), reverse=True)]
    topCols = topCols[:10]
    topFeatures = [wordList[index] for (index, value) in topCols]
    for f in topFeatures:
        print f
def main():
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=2000)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    w = np.asmatrix([0 for elem in range(Xtrain.shape[1])])
    learningRate = 1

    ## numTrial is the total number of rounds we want to go through before stopping (in case it is not converged)
    ## k is to keep track of how many rounds we have been through
    numTrial = 5
    k = 0

    ## wSum is to count the sum of w in a given round
    ## wAvg is to count the avg of w in a given round
    wAvg = w
    while makeError(Xtrain, Ytrain, wAvg):
        if k >= numTrial:
            print "No perfect hyperplane found!"
            print "Stop after " + str(numTrial) + " iterations."
            break
        k += 1
        for i in range(Xtrain.shape[0]):
            expected = -1
            xtrain = np.asmatrix(Xtrain[i]).T
            if w * xtrain > 0:
                expected = 1
            if expected != Ytrain[i]:
                w = w + learningRate * Ytrain[i] * Xtrain[i]
            if i == 0:
                wSum = w
            else:
                wSum += w
        wAvg = wSum / Xtrain.shape[0]

    trainError = computeError(Xtrain, Ytrain, w)
    print 'Train error rate is ' + str(trainError)
    testError = computeError(Xtest, Ytest, w)
    print 'Test error rate is ' + str(testError)
def main():
    d = Dataset("rec.sport.hockey.txt", "rec.sport.baseball.txt", cutoff=1000)
    (Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=100)

    lam = 100
    cols = []
    currentError = 1
    n = Xtrain.shape[1]
    dic = {}
    for j in range(n):
        cols.append(j)
        w = trainRidge(Xtrain[:, cols], Ytrain, lam)
        errorRate = computeError(Xtrain[:, cols], Ytrain, w)
        if errorRate >= currentError:
            cols.pop()
        else:
            dic[j] = currentError - errorRate
            currentError = errorRate
        ## print out currentError once in a while
        if j % 10 == 0:
            print currentError

    w = trainRidge(Xtrain[:, cols], Ytrain, lam)
    trainError = computeError(Xtrain[:, cols], Ytrain, w)
    print 'Train error rate is ' + str(trainError)
    testError = computeError(Xtest[:, cols], Ytest, w)
    print 'Test error rate is ' + str(testError)

    ## find the top 10 features
    wordList = d.getWordList()
    topCols = [(key, value) for key, value in
               sorted(dic.iteritems(), key=lambda (k, v): (v, k), reverse=True)]
    topCols = topCols[:10]
    topFeatures = [wordList[index] for (index, value) in topCols]
    for f in topFeatures:
        print f
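# A self-contained sketch of what the ridge helpers used in the two feature-selection
# snippets above might look like (hypothetical; the real trainRidge/computeError are
# not shown here). It assumes +/-1 labels and the closed-form ridge solution
# w = (X^T X + lam*I)^-1 X^T y, with the error rate being the fraction of sign disagreements.
import numpy as np

def trainRidge(X, Y, lam):
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).reshape(-1)
    return np.linalg.solve(X.T.dot(X) + lam * np.eye(X.shape[1]), X.T.dot(Y))

def computeError(X, Y, w):
    # fraction of examples where sign(Xw) disagrees with the +/-1 label
    preds = np.sign(np.asarray(X, dtype=float).dot(np.asarray(w).reshape(-1)))
    preds[preds == 0] = 1
    return float(np.mean(preds != np.asarray(Y).reshape(-1)))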
def main():
    print("This is a program to compute the min, max, mean and ")
    print('standard deviation for a set of numbers.\n')

    data = Dataset()
    while True:
        xStr = input('Enter a number (<Enter> to quit): ')
        if xStr == "":
            break
        try:
            x = float(xStr)
        except ValueError:
            print("Invalid Entry Ignored: Input was not a number")
            continue
        data.add(x)

    print('Summary of', data.size(), 'scores.')
    print('Min: ', data.min())
    print('Max: ', data.max())
    print('Mean: ', data.mean())
    print('Standard Deviation: ', data.std_deviation())
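# A minimal sketch of the Dataset helper the snippet above relies on (the real class
# is not shown here; the method names are taken directly from the calls above, and
# std_deviation is assumed to be the sample standard deviation over two or more values).
import math

class Dataset:
    def __init__(self):
        self._values = []

    def add(self, x):
        self._values.append(x)

    def size(self):
        return len(self._values)

    def min(self):
        return min(self._values)

    def max(self):
        return max(self._values)

    def mean(self):
        return sum(self._values) / len(self._values)

    def std_deviation(self):
        m = self.mean()
        squared = sum((v - m) ** 2 for v in self._values)
        return math.sqrt(squared / (len(self._values) - 1))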
def __init__(self, learning_rate, max_epochs, display_step, std_dev, dataset):
    # Initialize params
    self.learning_rate = learning_rate
    self.max_epochs = max_epochs
    self.display_step = display_step
    self.std_dev = std_dev
    self.dataset = dataset
    self.gen_imgs_lab = Dataset.loadDataset(dataset)

    # Store layers weight & bias
    self.weights = {
        'wc1': tf.Variable(tf.random_normal([11, 11, 3, 96], stddev=std_dev)),
        'wc2': tf.Variable(tf.random_normal([5, 5, 96, 192], stddev=std_dev)),
        'wc3': tf.Variable(tf.random_normal([3, 3, 192, 384], stddev=std_dev)),
        'wc4': tf.Variable(tf.random_normal([3, 3, 384, 384], stddev=std_dev)),
        'wc5': tf.Variable(tf.random_normal([3, 3, 384, 256], stddev=std_dev)),
        'wd': tf.Variable(tf.random_normal([12544, 4096])),
        'wfc': tf.Variable(tf.random_normal([4096, 1024], stddev=std_dev)),
        'out': tf.Variable(tf.random_normal([1024, n_classes], stddev=std_dev))
    }
    self.biases = {
        'bc1': tf.Variable(tf.random_normal([96])),
        'bc2': tf.Variable(tf.random_normal([192])),
        'bc3': tf.Variable(tf.random_normal([384])),
        'bc4': tf.Variable(tf.random_normal([384])),
        'bc5': tf.Variable(tf.random_normal([256])),
        'bd': tf.Variable(tf.random_normal([4096])),
        'bfc': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    # Graph input
    self.img_pl = tf.placeholder(tf.float32, [None, n_input, n_channels])
    self.label_pl = tf.placeholder(tf.float32, [None, n_classes])
    self.keep_prob = tf.placeholder(tf.float32)  # dropout (keep probability)

    # Create a saver for writing training checkpoints.
    self.saver = tf.train.Saver()
def test():
    ''' Test decision tree learning and classification '''
    ## Zoo Example
    zoo_attributes = ['hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic',
                      'predator', 'toothed', 'backbone', 'breathes', 'venomous',
                      'fins', 'legs', 'tail', 'domestic', 'catsize']

    ds = Dataset(16, 1)                   ## training Dataset
    ds.loadFromFile('zoo_train.data', 1)  ## load it from file
    tst = Dataset(16, 1)                  ## testing Dataset
    tst.loadFromFile('zoo_test.data', 1)

    attr = [i for i in range(16)]         ## [0, 1, 2, ..., 15]

    print "++++++++++++++++++++++++++++++++ZOO++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    tree = DTL(ds, attr, mostFrequent(ds.targets), ds, verbose=False)  ## make decision tree
    recallOnDataset(tree, tst)
    print '+++++++++++++++++++++++++++++END ZOO +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++'
datasetsOnSites = {}
datasetSet = {}
for line in phedexFile:
    l = line.split()
    datasetName = l[0]
    siteName = l[5]
    if not re.match(datasetPattern, datasetName):
        continue
    if re.match(".*BUNNIES.*", datasetName):
        # get rid of T0 testing datasets
        # why are these even in DDM?
        continue
    if not re.match(siterx, siteName):
        continue
    if datasetName not in datasetSet:
        datasetObject = Dataset(datasetName)
        datasetSet[datasetName] = datasetObject
    else:
        datasetObject = datasetSet[datasetName]
    datasetObject.isDeleted = False
    datasetObject.addCurrentSite(siteName, l[6], l[7])
    datasetObject = None

# remove blacklisted datasets
blacklistFile = open(os.environ.get('MONITOR_DB') + '/datasets/blacklist.log', 'r')
blacklistSet = set(map(lambda x: x.split()[0], list(blacklistFile)))
removeByKey(datasetSet, blacklistSet)

for fileName in sorted(files):
    if debug > 0:
        print ' Analyzing: ' + fileName
def GrowTree(dataset, attributes, level):
    print "Constructing Decision Tree", level
    max_gain, max_gain_attr, level = 0, None, 0
    print "Attributes ", attributes
    if not attributes:
        common_val = get_max_val(dataset.get("Class"))
        root = BTreeNode(str(common_val))
        root.left = None
        root.right = None
    else:
        if dataset.has_key('NA'):
            return BTreeNode(str(dataset.get('NA')))
        else:
            class_list = dataset.get("Class")
            tmp_negcnt, tmp_poscnt = get_count(class_list)
            if tmp_poscnt == 0:
                print "class: all negative examples"
                return BTreeNode('0')
            if tmp_negcnt == 0:
                print "class: all positive examples"
                return BTreeNode('1')
            for val in attributes:
                neg_dict, pos_dict = Dataset.split_dataset(dataset, val)
                variance_set = Heuristics.Variance_Impurity_Set(class_list)
                print "Variance Impurity set for ", val, variance_set
                member_list = dataset.get(val)
                variance_member = Heuristics.Variance_Impurity_Members(dataset, neg_dict, pos_dict, member_list)
                print "Variance Impurity member for ", val, variance_member
                var_gain = Heuristics.gain(variance_set, variance_member)
                print "Variance Impurity gain for ", val, var_gain
                print "Bool value - zeros", bool([a for a in neg_dict.values() if a == []])
                print "Bool value - ones", bool([a for a in pos_dict.values() if a == []])
                if bool([a for a in neg_dict.values() if a == []]):
                    print "Sub values empty - zero dataset"
                    common_val = get_max_val(dataset.get("Class"))
                    neg_dict = {}
                    neg_dict.update({'NA': common_val})
                elif bool([a for a in pos_dict.values() if a == []]):
                    print "Sub values empty - one dataset"
                    common_val = get_max_val(dataset.get("Class"))
                    pos_dict = {}
                    pos_dict.update({'NA': common_val})
                if var_gain > max_gain:
                    max_gain = var_gain
                    max_gain_attr = val
                    root_zero_dataset = neg_dict
                    root_one_dataset = pos_dict
                else:
                    max_gain = var_gain
                    max_gain_attr = val
                    root_zero_dataset = neg_dict
                    root_one_dataset = pos_dict

            print "Maximum Information Gain: ", max_gain
            print "Node selected", max_gain_attr
            print "Zero Dataset", root_zero_dataset
            print "One Dataset", root_one_dataset

            root = BTreeNode(max_gain_attr)
            if max_gain_attr in attributes:
                attributes.remove(max_gain_attr)
            if root != None:
                root.left = GrowTree(root_zero_dataset, attributes, level)
                root.right = GrowTree(root_one_dataset, attributes, level)
            level += 1
    return root
    print "|",
    print temp.data,
    # leaf node in right subtree
    if (temp.right.data == '0' or temp.right.data == '1'):
        print "= 1 :", temp.right.data
        temp = None
    else:
        print "= 1 :"
        level += 1
        temp = temp.right


if __name__ == "__main__":
    attributes = []
    att_dict = Dataset.load_dataset('data_sets1/verysmall.csv')
    for itm in att_dict.keys():
        if itm != "Class":
            attributes.append(itm)
    print attributes
    node = GrowTree(att_dict, attributes, 0)
    print_tree(node)
def setUp(self):
    self.data = Dataset.loadSmallPickledData()
    self.net = DeNet()
def runSmall():
    data = Dataset.loadSmallPickledData()
    net = DeNet()
    net.train(data, 2, 50, 0.1)
def training(self):
    # Launch the graph
    with tf.Session() as sess:
        # Construct model
        logits, prediction = self.alex_net_model(self.img_pl, self.weights,
                                                 self.biases, self.keep_prob)  # TO check

        # Define loss and optimizer
        # http://stackoverflow.com/questions/33922937/why-does-tensorflow-return-nan-nan-instead-of-probabilities-from-a-csv-file
        # equivalent to
        # tf.nn.softmax(...) + cross_entropy(...)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, self.label_pl))
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(loss)

        # Evaluate model
        correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(self.label_pl, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

        # Initializing the variables
        init = tf.initialize_all_variables()

        # Run the Op to initialize the variables.
        sess.run(init)
        summary_writer = tf.train.SummaryWriter(CKPT_DIR, graph=sess.graph)

        log.info('Dataset created - images list and labels list')
        log.info('Now split images and labels in Training and Test set...')

        ##################################################################
        # collect imgs for test
        tests_imgs_batches = [b for i, b in enumerate(self.BatchIterator(BATCH_SIZE)) if i < 3]

        # Run for epoch
        for epoch in range(self.max_epochs):
            self.gen_imgs_lab = Dataset.loadDataset(self.dataset)
            # Loop over all batches
            #for step in range(num_batch):
            for step, elems in enumerate(self.BatchIterator(BATCH_SIZE)):
                batch_imgs_train, batch_labels_train = elems
                ### create iterator over batch list ###
                #batch_imgs_train, batch_labels_train = self.BatchIterator(BATCH_SIZE)
                # ### call next() for next batch of imgs and labels ###
                # batch_imgs_train, batch_labels_train = iter_.next()

                # Fit training using batch data
                _, single_loss = sess.run([optimizer, loss],
                                          feed_dict={self.img_pl: batch_imgs_train,
                                                     self.label_pl: batch_labels_train,
                                                     self.keep_prob: dropout})
                # Display logs per epoch step
                if step % self.display_step == 0:
                    # print "Step %03d - Epoch %03d/%03d - single_loss %.7f" % (step, epoch, self.max_epochs, single_loss)
                    # log.info("Step %03d - Epoch %03d - single_loss %.7f" % (step, epoch, avg_loss/step, single_loss))

                    # Calculate training batch accuracy and batch loss
                    train_acc, train_loss = sess.run([accuracy, loss],
                                                     feed_dict={self.img_pl: batch_imgs_train,
                                                                self.label_pl: batch_labels_train,
                                                                self.keep_prob: 1.})
                    print "Training Accuracy = " + "{:.5f}".format(train_acc)
                    log.info("Training Accuracy = " + "{:.5f}".format(train_acc))
                    print "Training Loss = " + "{:.6f}".format(train_loss)
                    log.info("Training Loss = " + "{:.6f}".format(train_loss))

        print "Optimization Finished!"
        #print "Accuracy = ", sess.run(accuracy, feed_dict={self.img_pl: batch_imgs_train, self.label_pl: batch_labels_train, self.keep_prob: 1.0})

        # Save the models to disk
        save_model_ckpt = self.saver.save(sess, MODEL_CKPT)
        print("Model saved in file %s" % save_model_ckpt)

        # Test accuracy
        for step, elems in enumerate(tests_imgs_batches):
            batch_imgs_test, batch_labels_test = elems
            test_acc = sess.run(accuracy, feed_dict={self.img_pl: batch_imgs_test,
                                                     self.label_pl: batch_labels_test,
                                                     self.keep_prob: 1.0})
            print "Test accuracy: %.5f" % (test_acc)
            log.info("Test accuracy: %.5f" % (test_acc))
def runSmall():
    data = Dataset.loadSmallPickledData()
    net = LoopyNet()
    net.train(data, 30, 50, 0.1)
def setUp(self):
    # self.data = Dataset.SmallerDataset()
    self.data = Dataset.loadSmallPickledData()
    self.net = LoopyNet()
if __name__ == "__main__":
    global heuristic
    # print sys.argv
    attributes = []
    l = int(float(sys.argv[1]))
    k = int(float(sys.argv[2]))
    trainset = sys.argv[3]
    testset = sys.argv[4]
    validset = sys.argv[5]
    toprint = sys.argv[6]
    heuristic = int(float(sys.argv[7]))

    # print "Training Set"
    train_set = Dataset.load_dataset(trainset)
    for itm in train_set.keys():
        if itm != "Class":
            attributes.append(itm)
    node = GrowTree(train_set, attributes)

    # print "Test Set"
    test_set = Dataset.load_dataset(testset)
    accuracy = accuracy_tree(node, test_set)
    print "Test set Accuracy percentage", accuracy

    validation_set = Dataset.load_dataset(validset)
    print "Post Pruning accuracy ", post_pruning(node, l, k, validation_set)
def runMedium():
    data = Dataset.loadMediumPickledData()
    net = LoopyNet()
    net.train(data, 30, 100, 0.1)
def runBig():
    data = Dataset.loadPickledData()
    net = LoopyNet()
    net.train(data, 30, 100, 0.1)
def GrowTree(dataset, attributes):
    global cnt_nonleaf_nodes, heuristic
    max_gain_attr = None
    max_gain = 0.0
    gain = 0.0
    # print "Attributes ", attributes
    if not attributes:
        common_val = get_max_val(dataset.get("Class"))
        root = BTreeNode(str(common_val))
        root.left = None
        root.right = None
    else:
        if dataset.has_key('NA'):
            return BTreeNode(str(dataset.get('NA')))
        else:
            class_list = dataset.get("Class")
            # print class_list
            tmp_negcnt, tmp_poscnt = get_count(class_list)
            if tmp_poscnt == 0:
                print "class: all negative examples"
                # print class_list
                return BTreeNode('0')
            if tmp_negcnt == 0:
                print "class: all positive examples"
                # print class_list
                return BTreeNode('1')
            for val in attributes:
                neg_dict, pos_dict = Dataset.split_dataset(dataset, val)
                # print "Neg dict class", neg_dict.get("Class")
                # print "Pos dict class", pos_dict.get("Class")
                if heuristic == 0:
                    entropy_set = Heuristics.Entropy_Set(class_list)
                elif heuristic == 1:
                    variance_set = Heuristics.Variance_Impurity_Set(class_list)
                # print "Entropy set for ", val, entropy_set
                member_list = dataset.get(val)
                if heuristic == 0:
                    entropy_member = Heuristics.Entropy_Members(dataset, neg_dict, pos_dict, member_list)
                elif heuristic == 1:
                    variance_member = Heuristics.Variance_Impurity_Members(dataset, neg_dict, pos_dict, member_list)
                # print "Entropy member for ", val, entropy_member
                if heuristic == 0:
                    gain = Heuristics.gain(entropy_set, entropy_member)
                elif heuristic == 1:
                    gain = Heuristics.gain(variance_set, variance_member)
                print "gain for ", val, gain
                if bool([a for a in neg_dict.values() if a == []]):
                    print "Sub values empty - zero dataset"
                    common_val = get_max_val(dataset.get("Class"))
                    neg_dict = {}
                    neg_dict.update({'NA': common_val})
                elif bool([a for a in pos_dict.values() if a == []]):
                    print "Sub values empty - one dataset"
                    common_val = get_max_val(dataset.get("Class"))
                    pos_dict = {}
                    pos_dict.update({'NA': common_val})
                if gain >= max_gain:
                    max_gain = gain
                    max_gain_attr = val
                    root_zero_dataset = neg_dict
                    # print "inside max gain cal zeros ", val, neg_dict.get("Class")
                    root_one_dataset = pos_dict
                    # print "inside max gain cal ones ", val, pos_dict.get("Class")
                neg_dict = {}
                pos_dict = {}
                # print

            print "Maximum Information Gain: ", max_gain
            print "Node selected: ", max_gain_attr
            print "Zero Dataset: ", root_zero_dataset.get("Class")
            print "One Dataset: ", root_one_dataset.get("Class")

            root = BTreeNode(max_gain_attr)
            cnt_nonleaf_nodes += 1
            root.order = cnt_nonleaf_nodes
            root.subset = dataset
            if max_gain_attr in attributes:
                attributes.remove(max_gain_attr)
            if root != None:
                # if root.left:
                root.left = GrowTree(root_zero_dataset, attributes)
                # if root.right:
                root.right = GrowTree(root_one_dataset, attributes)
    return root