def main(_):
  with tf.Session() as sess:
    train_dataset = dataset.Dataset(
        os.path.join(_DATA_DIRECTORY, 'train.data'))
    test_dataset = dataset.Dataset(
        os.path.join(_DATA_DIRECTORY, 'test.data'))
    model = model_lib.Model(
        sequence_length=train_dataset.GetSequenceLength(),
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        learning_rate=FLAGS.learning_rate,
        momentum_rate=FLAGS.momentum_rate)
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    for i in range(FLAGS.max_steps):
      train_batch = train_dataset.GetBatch(FLAGS.train_batch_size)
      feed_dict = {
          model.sequences_placeholder: train_batch[0],
          model.true_labels_placeholder: train_batch[1],
      }
      model.mode = tf.contrib.learn.ModeKeys.TRAIN
      _ = sess.run([model.train_op, model.loss_op], feed_dict=feed_dict)
      model.mode = tf.contrib.learn.ModeKeys.EVAL
      loss = sess.run([model.loss_op], feed_dict=feed_dict)
      print(loss)

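# A minimal launch sketch (an assumption, not shown in the original file):
# TF1 scripts of this shape are normally started through tf.app.run, which
# parses the command-line FLAGS defined elsewhere in the module and then
# calls main(argv).
if __name__ == '__main__':
  tf.app.run(main)
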
def main(args):
    args.gpu = torch.cuda.is_available()
    utils.manual_seed(args.seed)
    Model = utils.load_module(args.model)
    cache_file = args.fcache or (os.path.join(
        'cache',
        'data_{}_{}.debug.pt'.format(args.model, args.dataset)
        if args.debug else 'data_{}_{}.pt'.format(args.model, args.dataset)))
    splits, ext = torch.load(cache_file, map_location=torch.device('cpu'))
    splits = {k: dataset.Dataset(v) for k, v in splits.items()}
    splits['train'] = Model.prune_train(splits['train'], args)
    splits['dev'] = Model.prune_dev(splits['dev'], args)
    if args.model == 'nl2sql':
        Reranker = utils.load_module(args.beam_rank)
        ext['reranker'] = Reranker(args, ext)
    m = Model(args, ext).place_on_device()
    d = m.get_file('')
    if not os.path.isdir(d):
        os.makedirs(d)
    pprint.pprint(m.get_stats(splits, ext))
    if not args.test_only:
        if not args.skip_upperbound:
            print('upperbound')
            pprint.pprint(m.compute_upperbound(splits['train'][:1000]))
        if args.aug:
            augs = []
            for a in args.aug:
                augs.extend(torch.load(a))
            aug = dataset.Dataset(augs)
            splits['aug'] = Model.prune_train(aug, args)[:args.aug_lim]
            print('aug upperbound')
            pprint.pprint(m.compute_upperbound(aug[:10]))
            # aug_args = copy.deepcopy(args)
            # if 'consistent' not in args.aug:
            #     aug_args.epoch = 10
            # aug_dev = dataset.Dataset(random.sample(splits['train'], 3000))
            # m.run_train(aug, aug_dev, args=aug_args)
            pprint.pprint(m.get_stats(splits, ext))
        m.run_train(dataset.Dataset(splits['train'] + splits.get('aug', [])),
                    splits['dev'], args=args)
    if args.resume:
        m.load_save(fname=args.resume)
    elif args.resumes:
        m.average_saves(args.resumes)
    if args.interactive_eval:
        dev_preds = m.run_interactive_pred(splits['dev'], args, verbose=True)
    else:
        dev_preds = m.run_pred(splits['dev'], args, verbose=True)
    if args.write_test_pred:
        with open(args.write_test_pred, 'wt') as f:
            json.dump(dev_preds, f, indent=2)
        print('saved test preds to {}'.format(args.write_test_pred))
    pprint.pprint(m.compute_metrics(splits['dev'], dev_preds))

def test_dataset_conversion(self):
    # Round-trip each file through VerticalDataset and back; rows must match.
    for path in ('../data/tiny.dat', '../data/chess_tiny.dat',
                 '../data/chess.dat'):
        ds = dataset.Dataset()
        ds.readFromFile(path)
        vds = dataset.VerticalDataset()
        vds.readFromDataset(ds)
        ds2 = dataset.Dataset()
        ds2.readFromDataset(vds)
        self.assertEqual(ds.rows, ds2.rows)

def run():
    path = datautils.Path('../input/train_data')
    image_files = datautils.get_images(path)
    train_paths, valid_paths = train_test_split(image_files,
                                                test_size=config.VALID_SPLIT,
                                                random_state=42)
    print(len(train_paths), len(valid_paths))
    encoder = joblib.load('label_encoder.pkl')
    train_ds = dataset.Dataset(train_paths,
                               get_labels=datautils.get_label,
                               label_enc=encoder,
                               size=(1200, 600))
    num_classes = len(encoder.classes_)
    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=config.TRAIN_BATCH_SIZE,
                                           num_workers=8,
                                           shuffle=True)
    valid_ds = dataset.Dataset(valid_paths,
                               get_labels=datautils.get_label,
                               label_enc=encoder,
                               size=(1200, 600))
    valid_dl = torch.utils.data.DataLoader(
        valid_ds,
        batch_size=config.TRAIN_BATCH_SIZE * 2,
        num_workers=8,
        shuffle=False)
    ocr_model = model.Model(num_classes)
    ocr_model.to(config.DEVICE)
    total_steps = len(train_dl) * config.N_EPOCHS
    opt = torch.optim.Adam(ocr_model.parameters(), config.MAX_LR)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, config.MAX_LR,
                                                    total_steps=total_steps)
    for epoch in range(config.N_EPOCHS):
        engine.train_loop(train_dl, ocr_model, opt, scheduler, None,
                          config.DEVICE)
        losses, output = engine.eval_loop(valid_dl, ocr_model, None,
                                          config.DEVICE)
        print(torch.tensor(losses).mean().item())
    save_dict = {
        'label_encoding': encoder,
        'model_dict': ocr_model.state_dict()
    }
    torch.save(save_dict, f'ocr_model_{config.N_EPOCHS}')

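# Hedged companion sketch (assumed, not part of the original file): reloading
# the checkpoint that run() saves above. The dict layout mirrors save_dict;
# model.Model and config are the same modules the training code uses, and
# load_ocr_model is a hypothetical helper name.
def load_ocr_model(path):
    checkpoint = torch.load(path, map_location=config.DEVICE)
    encoder = checkpoint['label_encoding']
    ocr_model = model.Model(len(encoder.classes_))
    ocr_model.load_state_dict(checkpoint['model_dict'])
    ocr_model.to(config.DEVICE)
    ocr_model.eval()
    return ocr_model, encoder
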
def readcspfile(self):
    try:
        import dataset  # NEEDED for reading and writing cryosparc cs files
    except ImportError:
        response = messagebox.showerror(
            "ERROR", "Cannot import required cryosparc library")
    else:
        self.particleset = dataset.Dataset().from_file(self.cspfile.get())
        self.passthruset = dataset.Dataset().from_file(
            self.passthrufile.get())
        if 'ctf/exp_group_id' in self.passthruset.data.keys():
            self.particleset = self.passthruset
            response = messagebox.showinfo(
                "IMPORTANT",
                "CTF information is in the passthrough file.\n Replace that file in cryosparc directory"
            )
        elif 'ctf/exp_group_id' in self.particleset.data.keys():
            response = messagebox.showinfo(
                "IMPORTANT",
                "CTF information is in the particleset file.\n Replace that file in cryosparc directory"
            )
        else:
            response = messagebox.showerror(
                "ERROR", "No CTF information found! Cannot group")
        groupdata = {}
        for i, dataline in enumerate(self.stardata):
            filename = os.path.basename(dataline[self.micnameindex])
            groupdata[filename] = self.grouplabels[i]
        keys = groupdata.keys()
        if 'location/micrograph_path' in self.passthruset.data.keys():
            numbad = 0
            for i in range(len(self.particleset.data)):
                filename = self.passthruset.data[
                    'location/micrograph_path'][i]
                basename = os.path.basename(filename)
                if basename in keys:
                    self.particleset.data['ctf/exp_group_id'][
                        i] = groupdata[basename]
                else:
                    # No appion ctf data for this micrograph; put all such
                    # particles in their own tilt group.
                    # print("error, no key found for " + basename)
                    self.particleset.data['ctf/exp_group_id'][i] = len(
                        self.grouplabels) + 1
                    numbad += 1
            if numbad > 0:
                response = messagebox.showwarning(
                    "WARNING",
                    "Number of particles without good appion ctf data:\n" +
                    str(numbad))
        else:
            response = messagebox.showerror(
                "ERROR",
                "No Micrograph filename information found! Cannot group")

def setUp(self):
    self.input_labels = ['label1', 'label2']
    self.samples = [
        {'label1': 1.0, 'label2': 2.0, 'output': 3.0, 'ignored': 1.0},
        {'label1': 4.0, 'label2': 5.0, 'output': 6.0, 'ignored': 1.0},
        {'label1': 7.0, 'label2': 8.0, 'output': 9.0, 'ignored': 1.0},
    ]
    self.samples_with_strings = [
        {'label1': 1.0, 'label2': 'foo', 'output': 3.0, 'ignored': 1.0},
        {'label1': 4.0, 'label2': 'foo', 'output': 6.0, 'ignored': 1.0},
        {'label1': 7.0, 'label2': '', 'output': 9.0, 'ignored': 1.0},
    ]
    self.output_generators = collections.OrderedDict([
        ('times10', lambda x: 10.0 * x['output']),
        ('filterGt3', lambda x: -1 if x['output'] > 3.0 else 42.0),
    ])
    self.dataset = dataset.Dataset(self.samples, self.input_labels,
                                   self.output_generators)
    self.dataset_with_strings = dataset.Dataset(self.samples_with_strings,
                                                self.input_labels,
                                                self.output_generators)

def __init__(self):
    self.ds_train = dataset.Dataset(config.trainPath, config.dictPath)
    self.ds_valid = dataset.Dataset(config.validPath, config.dictPath)
    self.ds_test = dataset.Dataset(config.testPath, config.dictPath)

    vocSize = len(self.ds_train.word2id)
    maxSeqLen = max(len(idLine) for idLine in self.ds_train.idData)
    padId = self.ds_train.word2id['<PAD>']
    self.model = lm.LM(vocSize, maxSeqLen, padId)  # vocSize+1 for padding index

    self.loss_log = logger.Logger('../result/loss.log')
    self.eval_log = logger.Logger('../result/eval.log')

def k_nearest(filters, n_neighbors, weights, verbose=False):
    """K nearest neighbors classifier experiment."""
    # Dataset folder operations
    if verbose:
        print('Getting dataset images paths...')
    paths = dataset.get_images_paths('../data/training/*.jpg')

    # Loading dataset
    if verbose:
        print('Loading images dataset...')
    input_data = dataset.Dataset(paths, filters=filters, use_mean=True)

    # Generating synthetic images
    if verbose:
        print('Generating synthetic dataset...')
    input_data.generate_sintetic_dataset()

    # Extract labels
    if verbose:
        print('Extracting labels...')
    labels = input_data.labels_array()

    # Input data matrix
    if verbose:
        print('Generating data matrix...')
    data_matrix = input_data.compute_data_matrix()

    # Compute PCA (keep the projected matrix; the original call discarded it)
    if verbose:
        print('Making principal component analysis...')
    pca = PCA(n_components=25)
    data_matrix = pca.fit_transform(data_matrix)

    # Instantiate k nearest neighbors classifier model
    if verbose:
        print('Instantiating model...')
    classifier = KNeighborsClassifier(n_jobs=-1,
                                      n_neighbors=n_neighbors,
                                      weights=weights)

    # Training the classifier and computing cross-validation scores
    if verbose:
        print('Training and computing score...')
    shuffle_split = ShuffleSplit(n_splits=15, test_size=0.15,
                                 train_size=0.85, random_state=0)
    scores = cross_val_score(estimator=classifier, X=data_matrix, y=labels,
                             cv=shuffle_split, n_jobs=-1)

    # Reporting results
    mean_score = np.mean(scores)
    std_deviation = np.std(scores)
    f = mean_score - std_deviation
    print('\nMean score: ', mean_score)
    print('Standard deviation: ', std_deviation)
    print('F: ', f)

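# Alternative sketch (an assumption about intent, not the author's code):
# wrapping PCA and the classifier in a scikit-learn Pipeline applies the
# projection inside every cross-validation fold, so the components are never
# fit on held-out data. PCA and KNeighborsClassifier come from the same
# imports the function above uses; k_nearest_pipeline is a hypothetical name.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline


def k_nearest_pipeline(data_matrix, labels, n_neighbors, weights, cv):
    pipeline = make_pipeline(
        PCA(n_components=25),
        KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,
                             n_jobs=-1))
    return cross_val_score(estimator=pipeline, X=data_matrix, y=labels,
                           cv=cv, n_jobs=-1)
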
def get_track_dataset(
    self,
    name,
    src_ibr_dir,
    tgt_ibr_dir,
    n_nbs,
    im_size=None,
    pad_width=16,
    patch=None,
    nbs_mode="argmax",
    train=False,
):
    logging.info(f"  create dataset for {name}")
    src_im_paths = sorted(src_ibr_dir.glob("im_*.png"))
    src_im_paths += sorted(src_ibr_dir.glob("im_*.jpg"))
    src_im_paths += sorted(src_ibr_dir.glob("im_*.jpeg"))
    src_dm_paths = sorted(src_ibr_dir.glob("dm_*.npy"))
    src_Ks = np.load(src_ibr_dir / "Ks.npy")
    src_Rs = np.load(src_ibr_dir / "Rs.npy")
    src_ts = np.load(src_ibr_dir / "ts.npy")

    tgt_im_paths = sorted(tgt_ibr_dir.glob("im_*.png"))
    tgt_im_paths += sorted(tgt_ibr_dir.glob("im_*.jpg"))
    tgt_im_paths += sorted(tgt_ibr_dir.glob("im_*.jpeg"))
    if len(tgt_im_paths) == 0:
        tgt_im_paths = None
    tgt_dm_paths = sorted(tgt_ibr_dir.glob("dm_*.npy"))

    count_paths = sorted(tgt_ibr_dir.glob("count_*.npy"))
    counts = [np.load(count_path) for count_path in count_paths]
    counts = np.array(counts)

    tgt_Ks = np.load(tgt_ibr_dir / "Ks.npy")
    tgt_Rs = np.load(tgt_ibr_dir / "Rs.npy")
    tgt_ts = np.load(tgt_ibr_dir / "ts.npy")

    dset = dataset.Dataset(
        name=name,
        tgt_im_paths=tgt_im_paths,
        tgt_dm_paths=tgt_dm_paths,
        tgt_Ks=tgt_Ks,
        tgt_Rs=tgt_Rs,
        tgt_ts=tgt_ts,
        tgt_counts=counts,
        src_im_paths=src_im_paths,
        src_dm_paths=src_dm_paths,
        src_Ks=src_Ks,
        src_Rs=src_Rs,
        src_ts=src_ts,
        im_size=im_size,
        pad_width=pad_width,
        patch=patch,
        n_nbs=n_nbs,
        nbs_mode=nbs_mode,
        bwd_depth_thresh=self.bwd_depth_thresh,
        invalid_depth_to_inf=self.invalid_depth_to_inf,
        train=train,
    )
    return dset

def setUp(self):
    instances = [dataset.Instance(0, {0: .2, 1: 1.0}, label=-1.0),
                 dataset.Instance(1, {0: .2, 1: .7}, label=-1.0),
                 dataset.Instance(2, {0: .5, 1: .5}, label=1.0),
                 dataset.Instance(3, {0: .7, 1: .7}, label=1.0)]
    inst_dict = dict(zip(range(4), instances))
    self.data = dataset.Dataset(instances=inst_dict)

def save_bottleneck_features(session, network, dataset):
    def transform(name, x, y):
        num_batches, batches = batches_mod.make_batches(x, y)
        transformed_results = []
        for batch_idx, (batch_x, _) in enumerate(batches):
            transformed_results.append(
                session.run(network.bottleneck_out,
                            feed_dict={network.x: batch_x}))
            print(f"{name}: {batch_idx}/{num_batches}")
            if batch_idx == 10:
                break
        return np.concatenate(transformed_results, axis=0)

    transform_train_x = transform("train_x", dataset.train_x, dataset.train_y)
    transform_valid_x = transform("valid_x", dataset.valid_x, dataset.valid_y)

    fname = f"../data/bottleneck_{network.name}_{dataset.name}.p"
    with open(fname, "wb") as f:
        pickle.dump(
            dataset_mod.Dataset(train_x=transform_train_x,
                                train_y=dataset.train_y,
                                valid_x=transform_valid_x,
                                valid_y=dataset.valid_y,
                                num_classes=dataset.num_classes,
                                name=dataset.name), f)

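# Hedged companion sketch (assumed, not in the original file): reading the
# bottleneck features back. pickle.load returns the dataset_mod.Dataset
# object exactly as saved above; load_bottleneck_features is a hypothetical
# helper name.
def load_bottleneck_features(network_name, dataset_name):
    fname = f"../data/bottleneck_{network_name}_{dataset_name}.p"
    with open(fname, "rb") as f:
        return pickle.load(f)
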
def waveform_capture_dataset(self):
    if len(self.capturedDataBuffer) > 0:
        self.signalsNames, self.wfm_data, self.time_vector = \
            self.capturedDataBuffer[0]
    else:
        self.ts.log_error('Did not capture data!')
    ds = dataset.Dataset()
    masterlist = self.analog_channels + self.digital_channels
    if len(self.signalsNames) == len(masterlist):
        ds.points.append('TIME')
        ds.data.append(self.time_vector[0::self.subsampling_rate])
        for chan_count, c in enumerate(masterlist):
            ds.points.append(wfm_typhoon_channels[c])
            ds.data.append(
                self.wfm_data[chan_count][0::self.subsampling_rate])
    else:
        self.ts.log_error(
            'Number of channels returned from waveform capture is unexpected. '
            'Expected %s. Got: %s' % (self.channelSettings, self.signalsNames))
    return ds

def test_fit_complex(self):
    tf.set_random_seed(1)
    np.random.seed(1)
    flags = testing_flags.FLAGS
    flags.input_dim = 1
    flags.hidden_dim = 32
    flags.num_hidden_layers = 2
    flags.output_dim = 1
    flags.batch_size = 32
    flags.num_epochs = 8
    flags.learning_rate = .002
    flags.l2_reg = 0.0
    flags.verbose = False
    flags.save_weights_every = 100000
    flags.snapshot_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, 'data',
                     'snapshots', 'test'))
    x = np.random.randn(1000).reshape(-1, 1)
    y = np.ones(x.shape)
    y[x < 0] = 0
    data = {'x_train': x, 'y_train': y, 'x_val': x, 'y_val': y}
    d = dataset.Dataset(data, flags)
    with tf.Session() as session:
        network = ffnn.FeedForwardNeuralNetwork(session, flags)
        network.fit(d)
        actual = network.predict(x)
        actual[actual < .5] = 0
        actual[actual >= .5] = 1
        np.testing.assert_array_almost_equal(y, actual, 8)

def main():
    # Basic settings
    patchsize = 40

    # Load networks
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_obj = OBJ_CNN().to(DEVICE)
    model_obj.load_state_dict(
        torch.load('./Model parameter/fire/obj_model_init.pkl'))

    # Training path setting
    dataDir = './'
    # To experiment on the test dataset instead, change this directory.
    trainingDir = dataDir + 'training/'
    trainingSets = util.getSubPaths(trainingDir)

    # Initialize training datasets
    trainingDataset = dataset.Dataset()
    trainingDataset.readFileNames(trainingSets[0])
    trainingDataset.SetObjID(1)

    # To experiment on another RGB image, change this index.
    i = 523
    imgBGR = trainingDataset.getBGR(i)

    # Sample every pixel whose patch fits fully inside the 640x480 image.
    sample = [[idx % 640, idx // 640] for idx in range(480 * 640)]
    sample_with_patch = [
        idx for idx in sample
        if patchsize / 2 <= idx[0] < 640 - patchsize / 2
        and patchsize / 2 <= idx[1] < 480 - patchsize / 2
    ]
    sample_with_patch = np.array(sample_with_patch).reshape([440, 600, 2])
    pred_coord = cnn.getCoordImg(colorData=imgBGR,
                                 sampling=sample_with_patch,
                                 patchsize=patchsize,
                                 model=model_obj) / 1000.0
    return pred_coord

def data_capture(self, enable=True, channels=None):
    """
    Enable/disable data capture.

    If sample_interval == 0, there will be no autonomous data captures and
    self.data_sample should be used to add data points to the capture.
    """
    if enable is True:
        if self._capture is False:
            self._ds = dataset.Dataset(self.data_points)
            self._last_datarec = []
            if self.sample_interval > 0:
                if self.sample_interval < MINIMUM_SAMPLE_PERIOD:
                    raise DASError('Sample period too small: %s' %
                                   (self.sample_interval))
                self._timer = self.ts.timer_start(
                    float(self.sample_interval) / 1000,
                    self._timer_timeout,
                    repeating=True)
            self._capture = True
    elif enable is False:
        if self._capture is True:
            if self._timer is not None:
                self.ts.timer_cancel(self._timer)
            self._timer = None
            self._capture = False
    self.device.data_capture(enable)

def main(): print("loading data...") ds = dataset.Dataset(classes=classes) train_X, train_y = ds.load_data('train') train_X = ds.preprocess_inputs(train_X) train_Y = ds.reshape_labels(train_y) print("input data shape...", train_X.shape) print("input label shape...", train_Y.shape) test_X, test_y = ds.load_data('test') test_X = ds.preprocess_inputs(test_X) test_Y = ds.reshape_labels(test_y) print("creating model...") model = SegNet(input_shape=input_shape, classes=classes) model.compile(loss="categorical_crossentropy", optimizer='adadelta', metrics=["accuracy"]) model.fit(train_X, train_Y, batch_size=batch_size, epochs=epochs, verbose=1, class_weight=class_weighting, validation_data=(test_X, test_Y), shuffle=True) model.save('seg.h5')
def read_func_data(self, func_lst_in_loop):
    # ------------ start (retrieve the target function data) ------------
    function_data_file = func_lst_in_loop[0] + ".dat"
    function_data_path = os.path.join(self.output_dir, function_data_file)
    # result_path = os.path.join(self.output_dir, 'data_batch_result.pkl')
    if os.path.exists(function_data_path):
        # pickle needs binary mode ('rb'/'wb'); the original used text mode.
        with open(function_data_path, 'rb') as f:
            data_batch = pickle.load(f)
        print('read the function data !!! ... %s' % function_data_path)
    else:
        my_data = dataset.Dataset(self.data_folder, func_lst_in_loop,
                                  self.embed_path, self.process_num,
                                  self.embed_dim, self.num_classes,
                                  self.tag, self.int2insn_path)
        data_batch = my_data.get_batch(batch_size=self.batch_size)
        with open(function_data_path, 'wb') as f:
            pickle.dump(data_batch, f)
        print('Save the function_data_path !!! ... %s' % function_data_path)

    # ******* start (used to predict the label of this data_batch) *******
    # keep_prob = 1.0
    # feed_batch_dict1 = {
    #     'data': data_batch['data'],
    #     'label': data_batch['label'],
    #     'length': data_batch['length'],
    #     'keep_prob_pl': keep_prob
    # }
    # print("type of feed_batch_dict1['data']", type(feed_batch_dict1['data']))
    # print("len of feed_batch_dict1['data']", len(feed_batch_dict1['data']))
    # print("data of feed_batch_dict1['data']", feed_batch_dict1['data'])
    # eval_predict.main(feed_batch_dict1)
    # ******* end (used to predict the label of this data_batch) *******
    # ------------ end (retrieve the target function data) ------------
    return data_batch

def evaluate(args):
    """Evaluate the classification model."""
    logger = logging.getLogger("alibaba")
    logger.info("Load data_set, vocab and label config...")
    if args.pretrained_embedding:
        word_vocab_ = PretrainedVocab(args)
    else:
        with open(os.path.join(args.vocab_dir, "vocab.data"), "rb") as fin:
            word_vocab_ = pickle.load(fin)
    with open(os.path.join(args.vocab_dir, "vocab_character.data"),
              "rb") as fin:
        vocab_character_ = pickle.load(fin)
    data = dataset.Dataset(args)
    logger.info("Convert word to id...")
    data.convert_to_ids(word_vocab_, set_name='test')
    logger.info("Convert character to id...")
    data.convert_to_ids(vocab_character_, character=True, set_name='test')
    logger.info("Build Model...")
    model_ = model.Model(args,
                         word_vocab=word_vocab_,
                         character_vocab=vocab_character_)
    model_.restore(model_dir=args.model_dir, model_prefix=args.class_model)
    logger.info("Evaluating the model on the test set...")
    dev_batchs = data.get_mini_batchs(batch_size=args.batch_size,
                                      set_name="test",
                                      predict=True)
    _ = model_.predictiton(
        batch_data=dev_batchs,
        result_file=args.result_file,
        save_predict_label=True,
    )
    logger.info("Predicted labels are saved to {}".format(args.result_file))

def begin(self, edge, strategy):
    data = dataset.Dataset('data/data.csv')
    self.train_results_len = int(TRAINING_SET_SPLIT *
                                 len(data.processed_results))
    self.train_results = data.processed_results[:self.train_results_len]
    self.test_results = data.processed_results[self.train_results_len:]
    self.test_results_info = [[] for i in range(7)]
    for i in range(3):
        self.test_results_info[i] = data.result_info[i][
            self.train_results_len:]

    def map_results(results):
        features = {}
        for result in results:
            for key in result.keys():
                if key not in features:
                    features[key] = []
                features[key].append(result[key])
        for key in features.keys():
            features[key] = np.array(features[key])
        return features, features['result']

    self.train_features, self.train_labels = map_results(self.train_results)
    self.test_features, self.test_labels = map_results(self.test_results)
    return self.learn(edge, strategy)

def setup(self, path):
    """Prepare experiment.

    Args:
        path (str): path to experiment configuration file.

    Returns:
        None.
    """
    if self.verbose:
        print('\n-- Starting experiment')
    self.config = self.read_config_file(path)
    training_paths, test_paths = dataset.get_images_paths(
        self.config['training_path'], self.config['dataset_mode'])
    if self.verbose:
        print('\n-- Loading dataset \n')
    self.dataset = dataset.Dataset(paths=training_paths,
                                   config=self.config,
                                   verbose=self.verbose)
    self.labels = self.dataset.labels_array()
    self.data_matrix = self.dataset.compute_data_matrix()
    if self.verbose:
        print('\n-- Making principal component analysis')
    pca = PCA(int(self.config['n_components']))
    # Keep the projected matrix (the original call discarded its result).
    self.data_matrix = pca.fit_transform(self.data_matrix)

def test_fit_basic(self):
    """
    Description:
        - overfit a debug dataset using the 'fit' method
    """
    tf.set_random_seed(1)
    np.random.seed(1)
    flags = testing_flags.FLAGS
    flags.input_dim = 3
    flags.hidden_dim = 32
    flags.num_hidden_layers = 2
    flags.output_dim = 2
    flags.batch_size = 16
    flags.num_epochs = 200
    flags.learning_rate = .05
    flags.l2_reg = 0.0
    flags.save_weights_every = 100000
    flags.snapshot_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, 'data',
                     'snapshots', 'test'))
    x = np.vstack((np.ones((flags.batch_size // 2, flags.input_dim)),
                   -1 * np.ones((flags.batch_size // 2, flags.input_dim))))
    y = np.vstack((np.zeros((flags.batch_size // 2, flags.output_dim)),
                   np.ones((flags.batch_size // 2, flags.output_dim))))
    data = {'x_train': x, 'y_train': y, 'x_val': x, 'y_val': y}
    d = dataset.Dataset(data, flags)
    with tf.Session() as session:
        network = ffnn.FeedForwardNeuralNetwork(session, flags)
        network.fit(d)
        actual = network.predict(x)
        np.testing.assert_array_almost_equal(y, actual, 8)

def train(args):
    """Train the classification model."""
    logger = logging.getLogger("alibaba")
    logger.info("Load data_set, vocab and label config...")
    if args.pretrained_embedding:
        word_vocab_ = PretrainedVocab(args)
    else:
        with open(os.path.join(args.vocab_dir, "vocab.data"), "rb") as fin:
            word_vocab_ = pickle.load(fin)
    with open(os.path.join(args.vocab_dir, "vocab_character.data"),
              "rb") as fin:
        vocab_character_ = pickle.load(fin)
    data = dataset.Dataset(args)
    logger.info("Convert word to id...")
    data.convert_to_ids(word_vocab_)
    logger.info("Convert character to id...")
    data.convert_to_ids(vocab_character_, character=True)
    logger.info("Build Model...")
    model_ = model.Model(args,
                         word_vocab=word_vocab_,
                         character_vocab=vocab_character_)
    logger.info("Training the model...")
    model_.train(
        data,
        args.epochs,
        args.batch_size,
        save_dir=args.model_dir,
        save_prefix=args.class_model,
    )
    logger.info("Done with training...")

def err_analyze(dst, mat, twtf, plcf, col):
    """Output a CSV for mat."""
    twt_lst = dataset.Dataset()
    with open(twtf) as ftwt:
        for line in ftwt:
            twt_lst.append(json.loads(line))
    places = dataset.DataItem()
    with open(plcf) as fplc:
        for line in fplc:
            place = json.loads(line)
            places[place[col]] = place
    with open(dst, 'w') as fdst:
        # Python 3 form of the original Python 2 `print >>fdst` statements.
        print('"Ref POI", "Hyp POI", "Text", "Ref Genre", "Hyp Genre", '
              '"Ref SGenre", "Hyp SGenre"', file=fdst)
        for i in mat:
            for j in mat:
                # if i != j:
                for item in mat[i][j]:
                    # ref hyp text rcat hcat rsc hsc
                    try:
                        print('"{0}","{1}","{2}","{3}","{4}","{5}","{6}"'
                              .format(csv_filter(places[i]['name']),
                                      csv_filter(places[j]['name']),
                                      fourq_filter(
                                          csv_filter(twt_lst[item]['text'])),
                                      places[i]['category'],
                                      places[j]['category'],
                                      places[i]['super_category'],
                                      places[j]['super_category']),
                              file=fdst)
                    except:
                        # Skip entries with missing place or tweet fields.
                        pass

def __init__(self, config):
    self.config = config
    self.label = config.label
    self.path = config.get_result_path()
    self.dataset = dataset.Dataset(config.get_dataset_filename())
    self.dataset.load()
    self.df = self.load_results()
    self.data = self.load_data()
    self.selected_features = self.load_selected_features()

    weight_filename = self.dataset.get_target_filename(config.weight)
    self.weight = np.memmap(weight_filename, 'float32')
    target_filename = self.dataset.get_target_filename(config.target)
    self.target = np.memmap(target_filename, 'float32')
    train_data_filename = self.dataset.get_train_filename()
    self.train_data_index = np.memmap(train_data_filename, 'int32')
    test_data_filename = self.dataset.get_test_filename()
    self.test_data_index = np.memmap(test_data_filename, 'int32')

    # if self.config.loss == "gamma":
    self.test_data_index = np.intersect1d(self.test_data_index,
                                          self.weight.nonzero())
    self.train_data_index = np.intersect1d(self.train_data_index,
                                           self.weight.nonzero())
    self.train_data = self.data[self.train_data_index, :]
    self.test_data = self.data[self.test_data_index, :]

    self.df_coeffs = self.load_coeffs()
    self.gini_curve = self.load_gini_curve()
    self.nb_features = self.gini_curve.shape[0]

def cate_smooth(twt_lst, ratio, sel, lmd):
    """Smooth the dataset by place category."""
    rst_lst = dataset.Dataset()
    pid_lst = twt_lst.distinct('place_id')
    twt_dist = twt_lst.groupfunc('place_id', len)
    tid_set = set(twt_lst.distinct('place_id'))
    pid_set = set(pid_lst)
    for pid in pid_lst:
        plc = dataset.loadrows(GEOTWEET,
                               ('id', 'lat', 'lng', 'super_category'),
                               ("id = '{0}'".format(pid),), 'place')
        plc_type = plc[0]['super_category']
        tmp_lst = list()
        cand = dataset.type_random(plc_type)
        for twt in cand:
            if twt['id'] not in tid_set and twt['place_id'] not in pid_lst:
                if sel(twt, plc):
                    twt['place_id'] = pid
                    tid_set.add(twt['id'])
                    pid_set.add(twt['place_id'])
                    tmp_lst.append(twt)
            if len(tmp_lst) >= ratio * twt_dist[pid]:
                break
        rst_lst.extend(tmp_lst)
    rst_lst.extend(twt_lst)
    return rst_lst

def benchmark(self, dd):
    """This method benchmarks Raha."""
    d = dataset.Dataset(dd)
    sampling_range = [self.LABELING_BUDGET]
    aggregate_results = {s: [] for s in sampling_range}
    for r in range(self.RUN_COUNT):
        print("Run {}...".format(r))
        for s in sampling_range:
            self.LABELING_BUDGET = s
            correction_dictionary = self.run(dd)
            er = d.get_data_cleaning_evaluation(correction_dictionary)[:3]
            aggregate_results[s].append(er)
    results_string = ("\\addplot[error bars/.cd,y dir=both,y explicit] "
                      "coordinates{(0,0.0)")
    for s in sampling_range:
        mean = numpy.mean(numpy.array(aggregate_results[s]), axis=0)
        std = numpy.std(numpy.array(aggregate_results[s]), axis=0)
        print("Raha on {}".format(d.name))
        print("Labeled Tuples Count = {}".format(s))
        print("Precision = {:.2f} +- {:.2f}".format(mean[0], std[0]))
        print("Recall = {:.2f} +- {:.2f}".format(mean[1], std[1]))
        print("F1 = {:.2f} +- {:.2f}".format(mean[2], std[2]))
        print("--------------------")
        results_string += "({},{:.2f})+-(0,{:.2f})".format(s, mean[2], std[2])
    results_string += "}; \\addlegendentry{Raha}"
    print(results_string)

def load(view):
    """Load the UIUC Car Dataset."""
    assert view in {"train", "test"}
    folder = "CarData"
    if view == "train":
        positive_image_filenames = [
            "%s/TrainImages/pos-%d.pgm" % (folder, i)
            for i in range(550)  # count specified by readme
        ]
        negative_image_filenames = [
            "%s/TrainImages/neg-%d.pgm" % (folder, i)
            for i in range(500)  # count specified by readme
        ]
        records = [
            dataset.DatasetRecord(filename, None, "positive")
            for filename in positive_image_filenames
        ] + [
            dataset.DatasetRecord(filename, None, "negative")
            for filename in negative_image_filenames
        ]
        return dataset.Dataset(_name="UIUC Cars", _folder=folder,
                               _records=records)

def getColumnDataset(self, seriesNumber, column, error=None, errorFn=None,
                     autoLabel=True, name=None, units=None):
    df = self.series[seriesNumber]
    col = df[df.columns[column]].dropna()
    if autoLabel:
        if name is not None or units is not None:
            _warnings.warn(
                'autoLabel selected and manual name/units parameters set. '
                'Defaulting to manual name parameters where available.')
        n, u = self.parseColumnName(df.columns[column])
        name = n if name is None else name
        units = u if units is None else units
    return _ds.Dataset(
        _np.array([
            s.replace(self.decimal, '.') if isinstance(s, str) else s
            for s in col
        ], 'float64'),
        error, errorFn, name, units)

def main(argv=None):
    # custom parse of flags for list input
    compression_flags.custom_parse_flags(FLAGS)

    # set random seeds
    np.random.seed(FLAGS.random_seed)
    tf.set_random_seed(FLAGS.random_seed)

    # load dataset
    input_filepath = FLAGS.dataset_filepath
    data = dataset_loaders.risk_dataset_loader(
        input_filepath,
        shuffle=True,
        train_split=.9,
        debug_size=FLAGS.debug_size,
        timesteps=FLAGS.timesteps,
        num_target_bins=FLAGS.num_target_bins,
        balanced_class_loss=FLAGS.balanced_class_loss,
        target_index=FLAGS.target_index)
    if FLAGS.use_priority:
        d = priority_dataset.PrioritizedDataset(data, FLAGS)
    else:
        if FLAGS.balanced_class_loss:
            d = dataset.WeightedDataset(data, FLAGS)
        else:
            d = dataset.Dataset(data, FLAGS)

    print('means:\n{}\n{}'.format(np.mean(d.data['y_train'], axis=0),
                                  np.mean(d.data['y_val'], axis=0)))
    y = copy.deepcopy(d.data['y_val'])
    y[y == 0.] = 1e-8
    y[y == 1.] = 1 - 1e-8
    compression_metrics.regression_score(y, np.mean(y, axis=0), 'baseline')
    compression_metrics.regression_score(y, y, 'correct')

    # fit the model
    with tf.Session(config=tf.ConfigProto(
            log_device_placement=False)) as session:
        # if the timestep dimension is > 1, use a recurrent network
        if FLAGS.timesteps > 1:
            network = rnn.RecurrentNeuralNetwork(session, FLAGS)
        else:
            if FLAGS.task_type == 'classification':
                if FLAGS.balanced_class_loss:
                    network = ffnn.WeightedClassificationFeedForwardNeuralNetwork(
                        session, FLAGS)
                else:
                    network = ffnn.ClassificationFeedForwardNeuralNetwork(
                        session, FLAGS)
            else:
                network = ffnn.FeedForwardNeuralNetwork(session, FLAGS)
        network.fit(d)

        # save weights to a julia-compatible weight file
        neural_networks.utils.save_trainable_variables(
            FLAGS.julia_weights_filepath, session, data)

        # evaluate the fit
        compression_metrics.evaluate_fit(network, data, FLAGS)

def __init__(self, reg=0.0025, learning_rate=0.05, annealing=1.,
             init_sigma=1, k=32, **kwargs):
    self.name = 'FM'
    self.dataset = dataset.Dataset()
    self.feature_dim = (self.dataset.num_users + 2 * self.dataset.num_items +
                        self.dataset.item_feature_dim)
    self.reg = reg
    # self.learning_rate will change due to annealing.
    self.learning_rate = learning_rate
    # self.init_learning_rate keeps the original value (for the filename).
    self.init_learning_rate = learning_rate
    self.annealing_rate = annealing
    self.init_sigma = init_sigma
    self.metrics = {
        'recall': {'direction': 1},
        'precision': {'direction': 1},
        'user_coverage': {'direction': 1},
        'item_coverage': {'direction': 1},
        'ndcg': {'direction': 1},
        # 'blockbuster_share': {'direction': -1},
    }
    self.k = k  # dimensionality of the factorization