def __init__(self, flags):
    """Build the TF session, train/val datasets, model, saver and queue runners.

    Args:
        flags: parsed command-line flags; ``flags.dataset`` selects the
            training dataset and determines its validation counterpart.

    Raises:
        ValueError: if ``flags.dataset`` has no known validation counterpart.
    """
    run_config = tf.ConfigProto()
    # Grow GPU memory on demand instead of grabbing it all up front.
    run_config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=run_config)
    self.flags = flags

    self.best_mae = float("inf")
    self.iter_time = 0

    self.train_dataset = dataset(self.flags.dataset)
    print('train dataset name: {}'.format(self.train_dataset.dataset_name))

    # Each training set has a fixed validation counterpart.
    if self.flags.dataset == 'brain01':
        self.val_dataset = dataset('brain05')
    elif self.flags.dataset == 'spine04':
        self.val_dataset = dataset('spine_val')
    else:
        # BUG FIX: previously self.val_dataset was left unset here and the
        # print below raised an opaque AttributeError. Fail fast instead.
        raise ValueError('no validation dataset defined for {}'.format(self.flags.dataset))
    # (also fixes the "datset" typo in the message)
    print('val dataset name: {}'.format(self.val_dataset.dataset_name))

    self.model = gan_repository(self.sess, self.flags, self.train_dataset)
    self._make_folders()
    self.evaluator = None

    self.saver = tf.train.Saver()
    self.sess.run(tf.global_variables_initializer())

    # threads for tfrecord reading
    self.coord = tf.train.Coordinator()
    self.threads = tf.train.start_queue_runners(sess=self.sess, coord=self.coord)
def __init__(self, train_file, test_file):
    """Load the train/test splits, build the vocabulary and the factor matrix.

    Args:
        train_file: path to the training data file.
        test_file: path to the test data file.
    """
    self.train_data = dataset(train_file, 'train')
    self.test_data = dataset(test_file, 'test')
    # (removed a commented-out re-splitting block that merged and re-split
    # the two sets with train_test_split — dead code)

    # Vocabulary is built over BOTH splits so test tokens are never OOV.
    self.encoder = Encoder()
    self.encoder.build_vocab(self.train_data.texts + self.test_data.texts)
    self.vocab_size = self.encoder.vocab_size
    # One weight block per label, laid out contiguously.
    self.feature_size = self.vocab_size * n_label
    print('feature count:', self.feature_size)

    # Timestamp used to tag log/model artifacts for this run.
    self.timemark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
    self.v = get_matrix(n_label, self.vocab_size)
    print('model built')
def build_data(self):
    """Load (or re-create) the processed train/valid/test splits.

    When ``self.opt['process_data']`` is truthy, the raw pickles are
    re-processed and the results cached under ``data/``; otherwise the
    cached pickles are loaded.

    Raises:
        RuntimeError: when the cached pickles are missing.
    """
    if self.opt['process_data']:
        self.train_dataset = dataset(
            "../../data/data1030/output/train_cut.pkl", self.opt, 'train')
        self.valid_dataset = dataset(
            "../../data/data1030/output/valid_cut.pkl", self.opt, 'valid')
        self.test_dataset = dataset(
            "../../data/data1030/output/test_cut.pkl", self.opt, 'test')

        self.train_processed_set = self.train_dataset.data_process(True)
        self.valid_processed_set = self.valid_dataset.data_process(True)
        self.test_processed_set = self.test_dataset.data_process(True)

        # Cache the processed splits; 'with' guarantees the handles close
        # (the original left the file objects from open(...) unclosed).
        for name, split in (('train', self.train_processed_set),
                            ('valid', self.valid_processed_set),
                            ('test', self.test_processed_set)):
            with open('data/{}_processed_set.pkl'.format(name), 'wb') as f:
                pickle.dump(split, f)
        logger.info("[Save processed data]")
    else:
        try:
            with open('data/train_processed_set.pkl', 'rb') as f:
                self.train_processed_set = pickle.load(f)
            with open('data/valid_processed_set.pkl', 'rb') as f:
                self.valid_processed_set = pickle.load(f)
            with open('data/test_processed_set.pkl', 'rb') as f:
                self.test_processed_set = pickle.load(f)
        except FileNotFoundError as err:
            # BUG FIX: was a bare `except:` plus `assert 1 == 0`, which is
            # stripped under `python -O` and swallows unrelated errors.
            raise RuntimeError("No processed data") from err
        logger.info("[Load processed data]")
def val(self, is_test=False):
    """Evaluate the recommendation head on the valid or test split.

    Args:
        is_test: if True evaluate on data/test_data.jsonl, otherwise on
            data/valid_data.jsonl.

    Returns:
        dict mapping each entry of ``self.metrics_rec`` (recall@{1,10,50},
        loss, gate, count, gate_count) to its value divided by ``count``.
    """
    # Reset the running metric accumulators for this evaluation pass.
    self.metrics_gen={"ppl":0,"dist1":0,"dist2":0,"dist3":0,"dist4":0,"bleu1":0,"bleu2":0,"bleu3":0,"bleu4":0,"count":0}
    self.metrics_rec={"recall@1":0,"recall@10":0,"recall@50":0,"loss":0,"gate":0,"count":0,'gate_count':0}
    self.model.eval()
    # Pick the evaluation split.
    if is_test:
        val_dataset = dataset('data/test_data.jsonl', self.opt)
    else:
        val_dataset = dataset('data/valid_data.jsonl', self.opt)
    val_set=CRSdataset(val_dataset.data_process(),self.opt['n_entity'],self.opt['n_concept'])
    val_dataset_loader = torch.utils.data.DataLoader(dataset=val_set,
                                                     batch_size=self.batch_size,
                                                     shuffle=False)
    recs=[]
    for context, c_lengths, response, r_length, mask_response, mask_r_length, entity, entity_vector, movie, concept_mask, dbpedia_mask, concept_vec, db_vec, rec in tqdm(val_dataset_loader):
        with torch.no_grad():
            seed_sets = []
            batch_size = context.shape[0]
            # Non-zero entity ids of each row seed the per-sample KG lookup.
            for b in range(batch_size):
                seed_set = entity[b].nonzero().view(-1).tolist()
                seed_sets.append(seed_set)
            # Forward pass in test mode; only the recommendation outputs are used.
            scores, preds, rec_scores, rec_loss, _, mask_loss, info_db_loss, info_con_loss = self.model(context.cuda(), response.cuda(), mask_response.cuda(), concept_mask, dbpedia_mask, seed_sets, movie, concept_vec, db_vec, entity_vector.cuda(), rec, test=True, maxlen=20, bsz=batch_size)
            recs.extend(rec.cpu())
            #print(losses)
            #exit()
            self.metrics_cal_rec(rec_loss, rec_scores, movie)
    # Average every accumulator by the number of scored examples.
    output_dict_rec={key: self.metrics_rec[key] / self.metrics_rec['count'] for key in self.metrics_rec}
    print(output_dict_rec)
    return output_dict_rec
def __init__(self, input_dim, hid_dim, class_num, d1, lrn_rate, momentum, batch_size_train, epoch_max, reg_lambda, train_file_name, val_file_name, test_file_name, log_file_name_head, gaus_train_file_name, gaus_val_file_name, gaus_test_file_name, attr_train_file_name, attr_val_file_name, attr_test_file_name, write_model_log_period):
    """Store the hyper-parameters and load the real and Gaussian-sample datasets.

    NOTE(review): the attr_* file names are accepted but not used in this
    constructor — kept for interface parity with sibling classes; confirm.
    """
    # Network architecture.
    self.input_dim, self.hid_dim, self.class_num, self.d1 = (
        input_dim, hid_dim, class_num, d1)
    # Optimisation hyper-parameters.
    self.lrn_rate, self.momentum = lrn_rate, momentum
    self.batch_size_train, self.epoch_max, self.reg_lambda = (
        batch_size_train, epoch_max, reg_lambda)
    # Logging configuration.
    self.log_file_name_head = log_file_name_head
    self.write_model_log_period = write_model_log_period
    # Real data and the matching Gaussian-sample data.
    self.data = dataset.dataset(train_file_name, val_file_name,
                                test_file_name, class_num, batch_size_train)
    self.gaus_sample = dataset.dataset(gaus_train_file_name,
                                       gaus_val_file_name,
                                       gaus_test_file_name, class_num,
                                       batch_size_train)
def __init__(self, train_file, test_file):
    """Merge the two splits, re-split them, and build a tf-idf vocabulary.

    Args:
        train_file: path to the training data file.
        test_file: path to the test data file.
    """
    train_data = dataset(train_file, 'train')
    test_data = dataset(test_file, 'test')

    # Pool both splits and re-split; train_test_split defaults to a 25%
    # test fraction and, with no random_state, a different split per run.
    texts = train_data.texts + test_data.texts
    targets = train_data.targets + test_data.targets
    train_data.texts, test_data.texts, train_data.targets, test_data.targets = train_test_split(
        texts, targets)
    train_data.size = len(train_data.targets)
    test_data.size = len(test_data.targets)

    # Fit tf-idf on the pooled texts to fix the feature vocabulary.
    tfidf = TfidfVectorizer(max_features=max_features, stop_words=stopwords)
    tfidf.fit(texts)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (use get_feature_names_out) — confirm the pinned sklearn version.
    self.vocab = {
        value: i
        for i, value in enumerate(tfidf.get_feature_names())
    }

    self.train_texts = train_data.texts
    self.train_targets = train_data.targets
    self.test_texts = test_data.texts
    self.test_targets = test_data.targets
    self.train_size = train_data.size
    self.test_size = test_data.size
    # Timestamp used to tag log/model artifacts for this run.
    self.timemark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
    self.v = get_matrix(n_label, max_features)
def get_train_data(batch_size, seq_len, dir_type, pixel, pretrained=None, shuffle=True, num_workers=0):
    """Build train/val dataloaders over MOT17/train and return them with sizes.

    Args:
        batch_size: dataloader batch size.
        seq_len: sequence length used for splitting and sampling.
        dir_type: directory layout passed to datastorage.prepare().
        pixel: pixel configuration forwarded to the dataset.
        pretrained: optional pretrained-feature configuration.
        shuffle: whether both loaders shuffle (applied to val too, as before).
        num_workers: dataloader worker processes.

    Returns:
        (train_loader, train_size, val_loader, val_size)
    """
    train_folder = 'MOT17/train/'
    storage = datastorage(train_folder)
    storage.prepare(dir_type)
    storage.split(seq_len)

    train_set = dataset(storage, seq_len, pixel, pretrained, 'train')
    train_set.create()
    val_set = dataset(storage, seq_len, pixel, pretrained, 'val')
    val_set.create()

    train_loader = data.DataLoader(train_set,
                                   batch_size=batch_size,
                                   collate_fn=train_set.collate_fn,
                                   shuffle=shuffle,
                                   num_workers=num_workers)
    val_loader = data.DataLoader(val_set,
                                 batch_size=batch_size,
                                 collate_fn=val_set.collate_fn,
                                 shuffle=shuffle,
                                 num_workers=num_workers)
    # len(obj) is the idiomatic form of obj.__len__().
    return train_loader, len(train_set), val_loader, len(val_set)
def __init__(self, input_dim, hid_dim, class_num, d1, lrn_rate, momentum, batch_size_train, epoch_max, reg_lambda, train_file_name, val_file_name, test_file_name, log_file_name_head, gaus_train_file_name, gaus_val_file_name, gaus_test_file_name, attr_train_file_name, attr_val_file_name, attr_test_file_name, write_model_log_period, match_coef=1, train_label_file_name=None, val_label_file_name=None, test_label_file_name=None, load_model_file_directory=None):
    """Store hyper-parameters and load real, Gaussian-sample and attribute data.

    The labeled-file names are optional; when omitted, the dataset is
    constructed without label files.
    """
    # Network architecture.
    self.input_dim = input_dim
    self.hid_dim = hid_dim
    self.class_num = class_num
    self.d1 = d1
    # NOTE(review): d2 and batch_size_test are hard-coded here (100 and
    # 24295) — presumably dataset-specific; confirm before reuse.
    self.d2 = 100
    # Optimisation hyper-parameters.
    self.lrn_rate = lrn_rate
    self.momentum = momentum
    self.batch_size_train = batch_size_train
    self.batch_size_test = 24295
    self.epoch_max = epoch_max
    self.reg_lambda = reg_lambda
    # Logging and checkpointing configuration.
    self.log_file_name_head = log_file_name_head
    self.write_model_log_period = write_model_log_period
    self.match_coef = match_coef
    self.load_model_file_directory = load_model_file_directory
    # Real data with optional label files.
    self.data = dataset.dataset(
        train_file_name,
        val_file_name,
        test_file_name,
        class_num,
        batch_size_train=batch_size_train,
        train_label_file_name=train_label_file_name,
        val_label_file_name=val_label_file_name,
        test_label_file_name=test_label_file_name)
    # Gaussian-sample counterpart (no labels).
    self.gaus_sample = dataset.dataset(gaus_train_file_name,
                                       gaus_val_file_name,
                                       gaus_test_file_name,
                                       class_num,
                                       batch_size_train=batch_size_train)
    # Attribute data lives in its own dataset type.
    self.attrdata = attrdataset.attrdataset(attr_train_file_name,
                                            attr_val_file_name,
                                            attr_test_file_name)
def __init__(self, data_directory, data_test_directory, n): print data_directory, data_test_directory self.data_directory = data_directory self.n = n X, y, tags = dataset.dataset(self.data_directory, self.n) self.nb_classes = len(tags) if data_test_directory == None: sample_count = len(y) train_size = sample_count * 4 // 5 X_train = X[:train_size] y_train = y[:train_size] X_test = X[train_size:] y_test = y[train_size:] else: X_train = X y_train = y X_test, y_test, test_tags = dataset.dataset(data_test_directory, n) nb_classes_test = len(test_tags) print test_tags print nb_classes_test, self.nb_classes assert nb_classes_test == self.nb_classes Y_train = np_utils.to_categorical(y_train, self.nb_classes) Y_test = np_utils.to_categorical(y_test, self.nb_classes) X_train = [x.reshape(n, n, 3) for x in X_train] X_test = [x.reshape(n, n, 3) for x in X_test] self.datagen = ImageDataGenerator(featurewise_center=False, samplewise_center=False, featurewise_std_normalization=False, samplewise_std_normalization=False, zca_whitening=False, rotation_range=45, width_shift_range=0.25, height_shift_range=0.25, horizontal_flip=True, vertical_flip=True, zoom_range=0.5, channel_shift_range=0.5, fill_mode='nearest') self.X = X self.y = y self.X_train = np.array(X_train) self.X_test = np.array(X_test) self.Y_train = Y_train self.Y_test = Y_test self.y_train = y_train self.y_test = y_test self.tags = tags
def test_parse_dataset(self):
    """parse_dataset() must accept both the .emc and the .h5 photon files."""
    print('=== Testing parse_dataset() ===')
    det = self.create_det()
    for suffix in (b'/data/photons.emc', b'/data/photons.h5'):
        dset = dataset.dataset(det)
        dset.parse_dataset(recon_folder + suffix)
        self.photons_tests(dset)
def __init__(self, input_dim, hid_dim, d1, lrn_rate, train_batch_size, epoch_max, momentum=0.0, coef_recon=1.0, coef_gan=1.0, unseen_class_file_name=None, train_file_name=None, val_file_name=None, test_file_name=None, train_label_file_name=None, val_label_file_name=None, test_label_file_name=None, train_attr_file_name=None, val_attr_file_name=None, test_attr_file_name=None, log_file_name_head=None, save_model_period=1, load_model_directory=None, generalizedZSL=False):
    """Store hyper-parameters and load feature and attribute datasets for ZSL.

    Note: ``unseen_class_file_name`` must point to a .npy file; np.load
    raises if it is None, so it is effectively required.
    """
    # Network architecture.
    self.input_dim = input_dim
    self.hid_dim = hid_dim
    self.d1 = d1
    # Optimisation hyper-parameters.
    self.lrn_rate = lrn_rate
    self.train_batch_size = train_batch_size
    self.epoch_max = epoch_max
    self.momentum = momentum
    # Loss-term weights (reconstruction and GAN).
    self.coef_recon = coef_recon
    self.coef_gan = coef_gan
    # Class ids held out during training (zero-shot classes).
    self.unseen_class = np.load(unseen_class_file_name)
    # Logging / checkpointing configuration.
    self.log_file_name_head = log_file_name_head
    self.save_model_period = save_model_period
    self.load_model_directory = load_model_directory
    self.generalizedZSL = generalizedZSL
    # Visual-feature data with labels.
    self.data = dataset.dataset(
        train_file_name=train_file_name,
        val_file_name=val_file_name,
        test_file_name=test_file_name,
        train_label_file_name=train_label_file_name,
        val_label_file_name=val_label_file_name,
        test_label_file_name=test_label_file_name)
    # Class-attribute data (no labels).
    self.attr_data = dataset.dataset(train_file_name=train_attr_file_name,
                                     val_file_name=val_attr_file_name,
                                     test_file_name=test_attr_file_name)
def val(self, is_test=False):
    """Evaluate the generation head on the valid or test split.

    Runs the model twice per batch: once with test=False to obtain the
    generation loss against the gold response, and once with test=True to
    decode predictions. Writes the contexts and decoded outputs to
    context_test.txt / output_test.txt.

    Args:
        is_test: if True evaluate on data/test_data.jsonl, otherwise on
            data/valid_data.jsonl.

    Returns:
        dict of generation metrics; bleu* entries are averaged by `count`,
        the others are reported as accumulated.
    """
    # Reset the running metric accumulators for this evaluation pass.
    self.metrics_gen={"ppl":0,"dist1":0,"dist2":0,"dist3":0,"dist4":0,"bleu1":0,"bleu2":0,"bleu3":0,"bleu4":0,"count":0}
    self.metrics_rec={"recall@1":0,"recall@10":0,"recall@50":0,"loss":0,"gate":0,"count":0,'gate_count':0}
    self.model.eval()
    # Pick the evaluation split.
    if is_test:
        val_dataset = dataset('data/test_data.jsonl', self.opt)
    else:
        val_dataset = dataset('data/valid_data.jsonl', self.opt)
    val_set=CRSdataset(val_dataset.data_process(True),self.opt['n_entity'],self.opt['n_concept'])
    val_dataset_loader = torch.utils.data.DataLoader(dataset=val_set,
                                                     batch_size=self.batch_size,
                                                     shuffle=False)
    inference_sum=[]
    golden_sum=[]
    context_sum=[]
    losses=[]
    recs=[]
    for context, c_lengths, response, r_length, mask_response, mask_r_length, entity, entity_vector, movie, concept_mask, dbpedia_mask, concept_vec, db_vec, rec in tqdm(val_dataset_loader):
        with torch.no_grad():
            seed_sets = []
            batch_size = context.shape[0]
            # Non-zero entity ids of each row seed the per-sample KG lookup.
            for b in range(batch_size):
                seed_set = entity[b].nonzero().view(-1).tolist()
                seed_sets.append(seed_set)
            # Teacher-forced pass: only gen_loss is used from this call.
            _, _, _, _, gen_loss, mask_loss, info_db_loss, info_con_loss = self.model(context.cuda(), response.cuda(), mask_response.cuda(), concept_mask, dbpedia_mask, seed_sets, movie, concept_vec, db_vec, entity_vector.cuda(), rec, test=False)
            # Free decoding pass (maxlen=20): yields the predictions.
            scores, preds, rec_scores, rec_loss, _, mask_loss, info_db_loss, info_con_loss = self.model(context.cuda(), response.cuda(), mask_response.cuda(), concept_mask, dbpedia_mask, seed_sets, movie, concept_vec, db_vec, entity_vector.cuda(), rec, test=True, maxlen=20, bsz=batch_size)
            golden_sum.extend(self.vector2sentence(response.cpu()))
            inference_sum.extend(self.vector2sentence(preds.cpu()))
            context_sum.extend(self.vector2sentence(context.cpu()))
            recs.extend(rec.cpu())
            losses.append(torch.mean(gen_loss))
            #print(losses)
            #exit()
    self.metrics_cal_gen(losses,inference_sum,golden_sum,recs)
    output_dict_gen={}
    for key in self.metrics_gen:
        # BLEU metrics are averaged over examples; the rest are reported raw.
        if 'bleu' in key:
            output_dict_gen[key]=self.metrics_gen[key]/self.metrics_gen['count']
        else:
            output_dict_gen[key]=self.metrics_gen[key]
    print(output_dict_gen)
    # Dump contexts and decoded outputs for inspection.
    f=open('context_test.txt','w',encoding='utf-8')
    f.writelines([' '.join(sen)+'\n' for sen in context_sum])
    f.close()
    f=open('output_test.txt','w',encoding='utf-8')
    f.writelines([' '.join(sen)+'\n' for sen in inference_sum])
    f.close()
    return output_dict_gen
def __init__(self, input_dim, attr_dim, disp_dim, lrn_rate, train_batch_size, epoch_max, momentum=0.0, coef_match=1.0, coef_recon=1.0, train_file_name=None, val_file_name=None, test_file_name=None, train_label_file_name=None, val_label_file_name=None, test_label_file_name=None, train_attr_file_name=None, val_attr_file_name=None, test_attr_file_name=None, log_file_name_head=None, save_model_period=1, load_model_directory=None):
    """Store hyper-parameters and load feature and attribute datasets."""
    # Network architecture.
    self.input_dim = input_dim
    self.attr_dim = attr_dim
    self.disp_dim = disp_dim
    # Optimisation hyper-parameters.
    self.lrn_rate = lrn_rate
    self.train_batch_size = train_batch_size
    self.epoch_max = epoch_max
    self.momentum = momentum
    # Loss-term weights (matching and reconstruction).
    self.coef_match = coef_match
    self.coef_recon = coef_recon
    # Logging / checkpointing configuration.
    self.log_file_name_head = log_file_name_head
    self.save_model_period = save_model_period
    self.load_model_directory = load_model_directory
    # Visual-feature data with labels.
    self.data = dataset.dataset(
        train_file_name=train_file_name,
        val_file_name=val_file_name,
        test_file_name=test_file_name,
        train_label_file_name=train_label_file_name,
        val_label_file_name=val_label_file_name,
        test_label_file_name=test_label_file_name)
    # Class-attribute data (no labels).
    self.attr_data = dataset.dataset(train_file_name=train_attr_file_name,
                                     val_file_name=val_attr_file_name,
                                     test_file_name=test_attr_file_name)
def test_parse_dataset(self):
    """Parse the .emc photon file, verify it, then parse it a second time.

    NOTE(review): the second parse_dataset call re-reads the same .emc file
    with no assertion afterwards — presumably it only checks that
    re-parsing does not crash; confirm it was not meant to read photons.h5
    (as the sibling test does).
    """
    print('=== Testing parse_dataset() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    dset.parse_dataset(recon_folder+b'/data/photons.emc')
    self.photons_tests(dset)
    dset.parse_dataset(recon_folder+b'/data/photons.emc')
def train():
    """Build the RNN, compile it with binary cross-entropy, and fit it."""
    train_loader, test_loader = dataset()
    rnn = MyRNN(units, total_words, embedding_len, input_len)
    rnn.compile(optimizer=optimizer,
                loss=tf.losses.BinaryCrossentropy(),
                metrics=['accuracy'])
    rnn.fit(train_loader, epochs=epochs, validation_data=test_loader)
def main(): """ Begin training the classifiers using gathered training data """ good = 0 bad = 0 for i in range(0, 10): """ The frame classifier to be trained """ svcf = svc_frame() """ Still handler """ s = still() """ Load all the training data from know folder loaction """ d = dataset() """ Create training set """ features = [] labels = [] for i in range(0, d.len()): s.load(d.feature(i)) features.append(s.compress_make_linear()) labels.append(d.is_frame(i)) """ Split for validiation """ features_train, features_test, labels_train, labels_test = \ cross_validation.train_test_split(features, labels, test_size=0.2, random_state=int(time.time())) """ Train """ for i in range(0, len(features_train)): svcf.train(features_train[i], labels_train[i]) """ Validate """ for i in range(0, len(features_test)): if svcf.is_frame(features_test[i]) == labels_test[i]: good += 1 else: bad += 1 print good, bad
def test_generate_blacklist(self):
    """generate_blacklist() must honor blacklist files and frame selection.

    Exercises three configurations in sequence: no blacklist (all zeros),
    an explicit blacklist file with 10 flagged frames, and the 'odd_only'
    selection which blacklists half the 3000 frames.
    """
    print('=== Testing generate_blacklist() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    dset.parse_dataset(recon_folder+b'/data/photons.emc')
    # Default config: a blacklist of all zeros over the 3000 frames.
    dset.generate_blacklist(config_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 0)
    # Regenerating with the same config must be idempotent.
    dset.generate_blacklist(config_fname)
    # Write a blacklist file flagging the first 10 frames and point the
    # config at it.
    blist_fname = recon_folder+b'/data/blacklist.dat'
    blist = np.zeros(dset.tot_num_data, dtype='u1')
    blist[:10] = 1
    np.savetxt(blist_fname.decode('utf-8'), blist, fmt='%d')
    config = DragonflyConfig(config_fname)
    config.modify_entry('emc', 'blacklist_file', 'data/blacklist.dat')
    dset.generate_blacklist(config_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 10)
    os.remove(blist_fname)
    # Switch to 'odd_only' selection: every even frame is blacklisted.
    config.remove_entry('emc', 'blacklist_file')
    config.modify_entry('emc', 'selection', 'odd_only')
    dset.generate_blacklist(config_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 1500)
    # Restore the config for the other tests.
    config.remove_entry('emc', 'selection')
def run(dataset_dir, output_dir):
    """Dump every image of the dataset into output_dir as frame-<i>.png."""
    # Get data (labels are loaded but not needed here).
    frames, labels, _ = dataset.dataset(dataset_dir, 299)
    for idx, frame in enumerate(frames):
        # NOTE(review): scipy.misc.imsave was removed in SciPy 1.2 —
        # confirm the pinned SciPy version supports it.
        target = output_dir + "/frame-" + str(idx) + ".png"
        scipy.misc.imsave(target, frame)
def get_dataset(type, positive_mfcc, negative_mfcc, positive_speakers, negative_speakers, positive_listeners, negative_listeners, max = None, min = None):
    """Build a dataset with listener/speaker paths enabled according to `type`.

    type: "both" enables speaker and listener paths, "speaker" only the
    speaker paths, "listener" only the listener paths; any other value
    disables both. (Parameter names shadow builtins but are kept for
    interface compatibility.)
    """
    use_speakers = type in ("both", "speaker")
    use_listeners = type in ("both", "listener")
    # Single call site: disabled paths collapse to None, matching the
    # original four-branch version exactly.
    return dataset.dataset(
        positive_mfcc, negative_mfcc, max=max, min=min,
        p_listeners_path=positive_listeners if use_listeners else None,
        p_speakers_path=positive_speakers if use_speakers else None,
        n_listeners_path=negative_listeners if use_listeners else None,
        n_speakers_path=negative_speakers if use_speakers else None)
def __init__(self, unlabeled_datasets = None, models = None, undersample_before_eval = False): ''' unlabeled_datasets should be either (1) a string pointing to a single data file (e.g., "mydata.txt") or (2) a list of strings pointing to multiple data files that represent the same data with different feature spaces. For more on the data format, consult the doc or see the samples. ''' if isinstance(unlabeled_datasets, str): # then a string, presumably pointing to a single data file, was passed in unlabeled_datasets = [unlabeled_datasets] self.unlabeled_datasets = unlabeled_datasets or [] # initialize empty labeled datasets (i.e., all data is unlabeled to begin with) # note that we give the labeled dataset the same name as the corresponding # unlabeled dataset if unlabeled_datasets is not None: self.labeled_datasets = [dataset.dataset(name=d.name) for d in unlabeled_datasets] self.models = models self.undersample_before_eval = undersample_before_eval self.undersample_function = self.undersample_labeled_datasets if undersample_before_eval else None self.query_function = self.base_q_function # throws exception if not overridden self.name = "Base" self.description = "" self.longer_name = "" # default prediction function; only important if you're aggregating multiple feature spaces (see # cautious_predict function documentation) self.predict_func = self.at_least print "using prediction function: %s" % self.predict_func.__name__ # if this is false, the models will not be rebuilt after each round of active learning self.rebuild_models_at_each_iter = True
def test_pearson_recommendation2(self):
    """Pearson similarity with the normalized model on dataset no. 3."""
    # Select the dataset.
    import dataset
    no = 3
    data = dataset.dataset(no)
    # Set the test parameters.
    sim_func_name = "pearson"
    model_type_name = "normalized"
    cf_model = cf(data.Us, data.Is, data.U2I2Rating, sim_func_name, model_type_name)
    # Similarity tests.
    user1 = "u1" # target
    user2 = "u2"
    user3 = "u3"
    user4 = "u4"
    self.assertEqual(cf_model.get_sim(user1, user2), -0.8)
    self.assertEqual(cf_model.get_sim(user1, user3), 1.0)
    self.assertEqual(cf_model.get_sim(user1, user4), 0.0)
    # Score test: base rating plus the similarity-weighted deviation.
    user = "******"
    item = "i6"
    score = cf_model.calc_score(user, item)
    self.assertEqual(score, 3.0 + (2-(-0.8)) / (abs(1) + abs(-0.8)) ) # 4.555...
    return
def main():
    """Normalize the vertebrate training set and run DBSCAN (eps=2, min_pts=3)."""
    training = dataset("vertebrate_train_nonoise.csv")
    training.normalize()
    clusterer = DBSCAN(training, 2, 3)
    clusterer.train_DBSCAN()
def test_calc_sum_fact(self):
    """calc_sum_fact() must equal sum(log(k!)) over each frame's photon counts.

    Reconstructs the first and last frames from the sparse (place_ones /
    place_multi) representation and compares the dense log-factorial sum
    against the precomputed sum_fact entries.
    """
    print('=== Testing calc_sum_fact() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    dset.parse_dataset(recon_folder + b'/data/photons.emc')
    dset.calc_sum_fact()
    # Rebuild frame 0: single-photon pixels get 1, multi-photon pixels get
    # their stored counts.
    frame = np.zeros(dset.num_pix, dtype='i4')
    frame[dset.place_ones[dset.ones_accum[0]:dset.ones_accum[0] +
                          dset.ones[0]]] = 1
    frame[dset.place_multi[dset.multi_accum[0]:dset.multi_accum[0] +
                           dset.multi[0]]] = dset.count_multi[
        dset.multi_accum[0]:dset.multi_accum[0] + dset.multi[0]]
    self.assertAlmostEqual(
        np.log(scipy.special.factorial(frame)).sum(), dset.sum_fact[0])
    # Same check for the last frame.
    frame = np.zeros(dset.num_pix, dtype='i4')
    frame[dset.place_ones[dset.ones_accum[-1]:dset.ones_accum[-1] +
                          dset.ones[-1]]] = 1
    frame[dset.place_multi[dset.multi_accum[-1]:dset.multi_accum[-1] +
                           dset.multi[-1]]] = dset.count_multi[
        dset.multi_accum[-1]:dset.multi_accum[-1] + dset.multi[-1]]
    self.assertAlmostEqual(
        np.log(scipy.special.factorial(frame)).sum(), dset.sum_fact[-1])
def test_generate_blacklist(self):
    """generate_blacklist() must honor blacklist files and frame selection.

    Exercises three configurations in sequence: no blacklist (all zeros),
    an explicit blacklist file with 10 flagged frames, and the 'odd_only'
    selection which blacklists half the 3000 frames.
    """
    print('=== Testing generate_blacklist() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    dset.parse_dataset(recon_folder + b'/data/photons.emc')
    # Default config: a blacklist of all zeros over the 3000 frames.
    dset.generate_blacklist(config_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 0)
    # Regenerating with the same config must be idempotent.
    dset.generate_blacklist(config_fname)
    # Write a blacklist file flagging the first 10 frames and point the
    # config at it.
    blist_fname = recon_folder + b'/data/blacklist.dat'
    blist = np.zeros(dset.tot_num_data, dtype='u1')
    blist[:10] = 1
    np.savetxt(blist_fname.decode('utf-8'), blist, fmt='%d')
    config = DragonflyConfig(config_fname)
    config.modify_entry('emc', 'blacklist_file', 'data/blacklist.dat')
    dset.generate_blacklist(config_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 10)
    os.remove(blist_fname)
    # Switch to 'odd_only' selection: every even frame is blacklisted.
    config.remove_entry('emc', 'blacklist_file')
    config.modify_entry('emc', 'selection', 'odd_only')
    dset.generate_blacklist(config_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 1500)
    # Restore the config for the other tests.
    config.remove_entry('emc', 'selection')
def main():
    """Build one mosaic image from 4 random samples and dump image + labels.

    Outputs: output.jpg (mosaic), output_box.jpg (mosaic with boxes drawn)
    and output.txt (annotations in YOLO [cls cx cy w h] format).
    """
    img_paths, annos = dataset(ANNO_DIR, IMG_DIR)

    idxs = random.sample(range(len(annos)), 4)

    new_image, new_annos = mosaic(img_paths, annos,
                                  idxs,
                                  OUTPUT_SIZE, SCALE_RANGE,
                                  filter_scale=FILTER_TINY_SCALE)

    cv2.imwrite('output.jpg', new_image)  # The mosaic image
    for anno in new_annos:
        # Annotations are normalized [cls, x1, y1, x2, y2]; scale to pixels.
        start_point = (int(anno[1] * OUTPUT_SIZE[1]), int(anno[2] * OUTPUT_SIZE[0]))
        end_point = (int(anno[3] * OUTPUT_SIZE[1]), int(anno[4] * OUTPUT_SIZE[0]))
        cv2.rectangle(new_image, start_point, end_point, (0, 255, 0), 1, cv2.LINE_AA)
    cv2.imwrite('output_box.jpg', new_image)  # The mosaic image with the bounding boxes

    # Convert corner boxes to YOLO center/size format.
    yolo_anno = []
    # BUG FIX: the original iterated over the undefined name `up_annos`
    # (NameError); the converted annotations are `new_annos`.
    for anno in new_annos:
        tmp = []
        tmp.append(anno[0])
        tmp.append((anno[3] + anno[1]) / 2)
        tmp.append((anno[4] + anno[2]) / 2)
        tmp.append(anno[3] - anno[1])
        tmp.append(anno[4] - anno[2])
        yolo_anno.append(tmp)

    # The output annotation file will appear in the output.txt file
    with open('output.txt', 'w') as file:
        for line in yolo_anno:
            file.write((' ').join([str(x) for x in line]) + '\n')
def test(args):
    """
    Predict on a test dataset given a model.

    Loads the trained seq2seq model, rebuilds the converters from the
    training data, optionally injects pretrained word2vec embeddings
    (args['E']), predicts intents/slots in batches and writes the results
    to args['O'] as JSON.

    :param args: dict of command-line options ('m', 'train_data',
        'test_data', 'E', 'b', 'O').
    """
    device = get_device(args)
    net = seq2seq.load(args['m'])
    dts = dataset(device)
    # The training data defines the word/slot/intent vocabularies.
    dts.read_training_dataset(args['train_data'])

    test = test_dataset(device,
                        words_converter=dts.words_converter,
                        slots_converter=dts.slots_converter,
                        intent_converter=dts.intent_converter)
    # BUG FIX (idiom): compare to None with `is not`, not `!=`.
    if args['E'] is not None:
        # External embeddings: keep the vocabulary open so new words can be
        # added, then load the pretrained vectors.
        test.read_test_dataset(args['test_data'], lock=False)
        embeddings = gensim.models.KeyedVectors.load_word2vec_format(
            args['E'], binary=True)
        net.pretrained_embeddings(test, embeddings)
    else:
        test.read_test_dataset(args['test_data'], lock=True)

    print(dts.intent_converter.no_entries())

    # predict!
    intent_pred, slots_pred = net.predict_batch(test, args['b'])
    predictions2json(test, intent_pred, slots_pred, args['O'])
def undersample_labeled_datasets(self, k=None):
    '''
    Undersamples the current labeled datasets, i.e., makes the two classes of
    equal sizes. Note that this method returns a *copy* of the undersampled
    datasets. Thus it *does not mutate the labeled datasets*.

    NOTE(review): when k is outside (0, number_of_majority_examples) the
    inner branch silently does nothing and the unchanged copies are
    returned — confirm that is intended.
    '''
    if self.labeled_datasets and len(self.labeled_datasets) and (len(self.labeled_datasets[0].instances)):
        if not k:
            print "undersampling majority class to equal that of the minority examples"
            # we have to include 'false' minorities -- i.e., instances we've
            # assumed are positives -- because otherwise we'd be cheating
            k = self.labeled_datasets[0].number_of_majority_examples() - self.labeled_datasets[0].number_of_minority_examples()
        # we copy the datasets rather than mutate the class members.
        copied_datasets = [dataset.dataset(list(d.instances)) for d in self.labeled_datasets]
        if k < self.labeled_datasets[0].number_of_majority_examples() and k > 0:
            # make sure we have enough majority examples...
            print "removing %s majority instances. there are %s total majority examples in the dataset." % (k, self.labeled_datasets[0].number_of_majority_examples())
            removed_instances = copied_datasets[0].undersample(k)
            # get the removed instance numbers
            removed_instance_nums = [inst.id for inst in removed_instances]
            # if there is more than one feature-space, remove the same
            # instances from the remaining spaces (sets)
            for labeled_dataset in copied_datasets[1:]:
                # now remove them from the corresponding sets
                labeled_dataset.remove_instances(removed_instance_nums)
    else:
        raise Exception, "No labeled data has been provided!"
    return copied_datasets
def predict(model_path, dataset_dir):
    """Run the frozen graph over the dataset and return per-sample predictions."""
    # load tf graph and resolve its input/output tensors
    tf_model, tf_input, tf_output = load_graph(model_path)
    x = tf_model.get_tensor_by_name(tf_input)
    y = tf_model.get_tensor_by_name(tf_output)

    # Get data (labels are loaded but unused here)
    data_X, data_y, _ = dataset.dataset(dataset_dir, 299)

    with tf.Session(graph=tf_model) as sess:
        graph_def = sess.graph.as_graph_def()
        # TensorBoard plumbing: log the input images and the graph itself.
        tf.summary.image('input', x, max_outputs=7)
        writer = tf.summary.FileWriter("/tmp/log/")
        writer.add_graph(sess.graph)
        merged_summary = tf.summary.merge_all()

        predictions = []
        for step, sample in enumerate(data_X):
            summary, batch_pred = sess.run([merged_summary, y],
                                           feed_dict={x: [sample]})
            writer.add_summary(summary, step)
            predictions.append(batch_pred[0])
        return predictions
def test_pearson_recommendation1(self):
    """Pearson similarity with the normalized model on dataset no. 2."""
    # Select the dataset.
    import dataset
    no = 2
    data = dataset.dataset(no)
    # Set the test parameters.
    sim_func_name = "pearson"
    model_type_name = "normalized"
    cf_model = cf(data.Us, data.Is, data.U2I2Rating, sim_func_name, model_type_name)
    # Similarity tests.
    user1 = "u1"
    user2 = "u2" # target
    user3 = "u3"
    user4 = "u4"
    self.assertEqual(cf_model.get_sim(user2, user1), 0.0)
    self.assertEqual(cf_model.get_sim(user2, user3), 1.0)
    self.assertEqual(cf_model.get_sim(user2, user4), -1.0)
    # Score test.
    user = "******"
    item = "i1"
    score = cf_model.calc_score(user, item)
    self.assertEqual(score, 2.75)
    return
def main():
    """
    Get the dataset and model, set the checkpoint callback, then train,
    keeping only the weights with the best validation accuracy.
    """
    train_images, train_labels, test_images, test_labels = dataset()
    model = get_model()
    model.summary()

    checkpoint_path = "training/cp-{epoch:04d}.ckpt"
    # BUG FIX: the dirname was computed but its result was discarded
    # (a no-op statement); actually create the checkpoint directory.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Keep only the checkpoint with the best validation accuracy.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True)

    # Save the weights using the `checkpoint_path` format
    model.save_weights(checkpoint_path.format(epoch=0))

    # Train the model with the new callback
    model.fit(train_images, train_labels,
              epochs=100,
              validation_data=(test_images, test_labels),
              callbacks=[cp_callback],
              verbose=2)
def extract_features(features_opts, dataset_opts, params):
    # Extract image features for both file lists in a single pass and return
    # (files1, features_of_files1, files2, features_of_files2).
    print "# Extracting image features"
    files1, files2 = dataset(dataset_opts)
    features = []
    # files1 is processed first, so the features list splits cleanly in half.
    for img_file, depth_file in print_progress(files1 + files2):
        features.append(feature_extraction(img_file, depth_file, features_opts, params))
    # Python 2 integer division: first half belongs to files1, second half
    # to files2 (assumes len(files1) == len(files2)).
    return files1, features[: len(features) / 2], files2, features[len(features) / 2 :]
def run(session):
    # Build the dataset and the LSTM graph, then run the train/validate loop.
    d = dataset.dataset(max_length=config.EXAMPLE_MAX_LEN,
                        num_examples=config.NUM_EXAMPLES)
    g = graph.graph(
        batch_size=config.BATCH_SIZE,
        sequence_length=config.EXAMPLE_MAX_LEN,
        vocab_size=len(d.word_to_idx),
        num_embedding_dimensions=config.NUM_EMBEDDING_DIMENSIONS,
        num_lstm_layers=config.NUM_LSTM_LAYERS,
        num_lstm_units=config.NUM_LSTM_UNITS,
        start_word_idx=d.word_to_idx[dataset.start_word],
        stop_word_idx=d.word_to_idx[dataset.stop_word],
    )
    session.run(tf.global_variables_initializer())
    file_writer = tf.summary.FileWriter('logs/', session.graph)
    run_info = RunInfo(dataset=d, graph=g, session=session)
    # NOTE(review): both ranges start at 1, so epoch/batch index 0 is
    # skipped and only NUM_EPOCHS-1 epochs (num_batches-1 batches) run —
    # presumably 1-based counters for logging; confirm this is intended.
    for epoch_idx in range(1, config.NUM_EPOCHS):
        num_batches = len(d.training_idx_examples) // config.BATCH_SIZE
        for batch_idx in range(1, num_batches):
            batch_info = BatchInfo(
                epoch_idx=epoch_idx,
                batch_idx=batch_idx,
            )
            run_batch(run_info, batch_info)
        # Validate at the end of every epoch.
        run_validation(run_info, epoch_idx)
def main():
    """Seed torch, build the loaders, train the EqProp network and test it."""
    # Set random seed if given (falls back to the current seed otherwise).
    torch.manual_seed(RANDOM_SEED or torch.initial_seed())

    # Define dataset
    trainloader, testloader = dataset(BATCH_SIZE)

    # Define network directly from the configuration constants; rho is the
    # hard-sigmoid activation (assumes a torch.Tensor input).
    eqprop_net = EqPropNet_NoGrad(
        batch_size=BATCH_SIZE,
        layers_sizes=LAYER_SIZES,
        learning_rates=LEARNING_RATES,
        n_iter_1=N_ITER_1,
        n_iter_2=N_ITER_2,
        rho=lambda x: x.clamp(0, 1),
        beta=BETA,
        dt=DELTA,
    )

    # Train, then validate.
    train(eqprop_net, trainloader)
    test(eqprop_net, testloader)
def problem_generator_y(N, dev, mode: GenerationMode, factory: ProblemFactory, path=None):
    """
    The function problem_generator_y is an adapted version of problem_generator
    which can be used as callback function while training a neural network in
    order to generate new training data at the beginning of each epoch.
    For a more detailed description, see problem_generator.
    """
    content = extract(mode, factory, path=path)
    root = lin_opt_pbs(content[0], content[1], content[2], content[3], content[4], mode=mode)
    root.set_deviation(dev)
    # Endless generator: one fresh batch of N solved problems per epoch.
    while True:
        root.clear_generated_RHS()
        solutions = root.generate_and_solve(N)
        right_hand_sides = root.extract_RHS()
        yield dataset(right_hand_sides, solutions)
def test_parse_dataset_list(self):
    """parse_dataset_list() must chain multiple photon files via dset.next.

    Checks a list of two .emc files, then a mixed .emc + .h5 list; the
    second dataset in the chain is reached through dset.next.
    """
    print('=== Testing parse_dataset_list() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    # Write a temporary list file naming the same .emc file twice.
    list_fname = b'test_dset_flist.txt'
    with open(list_fname, 'w') as f:
        f.writelines([
            (recon_folder + b'/data/photons.emc\n').decode('utf-8'),
            (recon_folder + b'/data/photons.emc\n').decode('utf-8')
        ])
    num_dsets = dset.parse_dataset_list(list_fname)
    self.photons_tests(dset, num_dsets)
    # The second dataset hangs off the first via the linked-list `next`.
    ndset = dset.next
    self.photons_tests(ndset, num_dsets, False)
    # Re-parsing the same list must work on an already-populated dset.
    dset.parse_dataset_list(list_fname)
    # Now mix formats: one .emc and one .h5 file.
    with open(list_fname, 'w') as f:
        f.writelines([
            (recon_folder + b'/data/photons.emc\n').decode('utf-8'),
            (recon_folder + b'/data/photons.h5\n').decode('utf-8')
        ])
    num_dsets = dset.parse_dataset_list(list_fname)
    self.photons_tests(dset, num_dsets)
    ndset = dset.next
    self.photons_tests(ndset, num_dsets, False)
    os.remove(list_fname)
def test_cosine_recommendation_sim_multi_score(self):
    """Cosine similarity with the sim_multi_rating model on dataset no. 1."""
    # Select the dataset.
    import dataset
    no = 1
    data = dataset.dataset(no)
    # Set the test parameters.
    sim_func_name = "cosine"
    model_type_name = "sim_multi_rating"
    cf_model = cf(data.Us, data.Is, data.U2I2Rating, sim_func_name, model_type_name)
    # Score tests for user u1 against every item.
    user1 = "u1"
    user2 = "u2"
    user3 = "u3"
    user4 = "u4"
    user5 = "u5"
    item1 = "i1"
    item2 = "i2"
    item3 = "i3"
    item4 = "i4"
    self.assertEqual(cf_model.calc_score(user1, item1), 2.652365080899908)
    self.assertEqual(cf_model.calc_score(user1, item2), 0.3434277633975243)
    self.assertEqual(cf_model.calc_score(user1, item3), 2.7852678291248507)
    self.assertEqual(cf_model.calc_score(user1, item4), 2.2348318324104093)
    # Reference rankings for the remaining users (not asserted):
    # u2 [('i4', 6.5480200273566895), ('i3', 5.685121675688175), ('i1', 3.2869185147104245), ('i2', 2.737414017877664)]
    # u3 [('i3', 7.04574847038176), ('i4', 5.517604872186145), ('i1', 2.5262421022799537), ('i2', 1.415727598843663)]
    # u4 [('i4', 7.945448566343385), ('i3', 7.371732349896483), ('i1', 3.142243637026426), ('i2', 1.6623943235017373)]
    # u5 [('i4', 7.690628217779786), ('i3', 6.50655550912908), ('i1', 3.5991851423622627), ('i2', 0.8617748202735571)]
    return
def test_generate_data(self):
    """Exercise generate_data(): single photon file, emc file list, and h5 file list."""
    print('=== Testing generate_data() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    # Default config: a single in_photons_file entry
    dset.generate_data(config_fname)
    self.photons_tests(dset)
    dset.generate_data(config_fname)
    # Switch the config to a two-entry emc photon-file list
    list_fname = recon_folder + b'/test_photons_list.txt'
    with open(list_fname, 'w') as f:
        f.writelines(['data/photons.emc\n', 'data/photons.emc\n'])
    config = DragonflyConfig(config_fname)
    config.modify_entry('emc', 'in_photons_list', list_fname.decode('utf-8'))
    # Having both in_photons_file and in_photons_list set must fail
    self.assertRaises(AssertionError, dset.generate_data, config_fname)
    config.remove_entry('emc', 'in_photons_file')
    dset.generate_data(config_fname)
    self.photons_tests(dset, 2)
    ndset = dset.next  # second dataset in the chain (presumably a linked list) -- confirm
    self.photons_tests(ndset, 2, False)
    # Same list but with HDF5-format photon files
    with open(list_fname, 'w') as f:
        f.writelines(['data/photons.h5\n', 'data/photons.h5\n'])
    config.modify_entry('emc', 'in_photons_list', list_fname.decode('utf-8'))
    dset.generate_data(config_fname)
    self.photons_tests(dset, 2)
    ndset = dset.next
    self.photons_tests(ndset, 2, False)
    # Restore original config state for subsequent tests
    os.remove(list_fname)
    config.remove_entry('emc', 'in_photons_list')
    config.modify_entry('emc', 'in_photons_file', 'make_data:::out_photons_file')
def main(self):
    """Top-level driver: load config and model, initialize TensorFlow, then
    dispatch to the requested mode (twitter daemon, batch-file prediction,
    interactive CLI testing, or training)."""
    print("SmartGator Intelligent chatbot")
    self.root_dir = os.getcwd()
    self.load_config()
    self.load_model_params()
    self.load_args()
    self.update_settings()
    self.text_data = dataset(self.args)
    # RNN Model Initialized #
    self.model = RNNModel(self.text_data, self.args)
    # Handlers to write and save learned models #
    self.writer = tf.summary.FileWriter(self._get_summary_name())
    self.saver = tf.train.Saver(max_to_keep=200, write_version=tf.train.SaverDef.V1)
    self.session = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False))
    print("Initializing tf variables")
    self.session.run(tf.global_variables_initializer())
    # If a previous model exists load it and proceed from last run step #
    self.manage_previous_model(self.session)
    # If using word2vec model we need to load word vectors #
    if self.init_embeddings:
        self.load_embedding(self.session)
    if self.twitter:
        # Twitter interface mode: nothing more to do here #
        return
    elif self.file_:
        # Batch mode: one prediction per input line, written to the output file.
        # FIX: the original used two bare `except:` clauses which silently
        # swallowed every error (including KeyboardInterrupt/SystemExit);
        # catch OSError for file problems and report the actual exception.
        # The nested with-blocks are also merged into one statement.
        try:
            with open(self.TEST_IN_NAME, "r") as f, \
                    open(self.TEST_OUT_SUFFIX, 'w') as output:
                for line in f:
                    # strip the trailing newline before predicting
                    output.write(self.predict_daemon(line[:-1]) + "\n")
        except OSError as err:
            print("File error during batch prediction: {}".format(err))
    elif self.test:
        # Interactive CLI testing mode #
        self.interactive_main(self.session)
    else:
        # Training mode #
        self.train_model(self.session)
    self.session.close()
    print("Say Bye Bye to SmartGator! ;)")
def test_free_data(self):
    """free_data() releases parsed photon data and is safe to call twice."""
    print('=== Testing free_data() ===')
    detector_obj = self.create_det()
    dset = dataset.dataset(detector_obj)
    photons_path = recon_folder + b'/data/photons.emc'
    dset.parse_dataset(photons_path)
    # A double free must be a harmless no-op, not a crash
    for _ in range(2):
        dset.free_data()
    self.assertIsNone(dset.num_data)
def allocate_iterate(self):
    """Construct and initialize all core reconstruction objects from config_fname.

    Returns the tuple (itr, det, dset, param, qmax): the iterate, detector,
    dataset, parameter objects, and qmax as returned by generate_detectors
    (presumably the maximum scattering-vector magnitude -- confirm in detector).
    """
    itr = iterate.iterate()
    det = detector.detector()
    dset = dataset.dataset(det)
    param = params.params()
    qmax = det.generate_detectors(config_fname)
    dset.generate_data(config_fname)
    param.generate_params(config_fname)
    dset.generate_blacklist(config_fname)
    # iterate needs everything above to size its internal arrays
    itr.generate_iterate(config_fname, qmax, param, det, dset)
    return itr, det, dset, param, qmax
def test_parse_data(self):
    """Exercise dataset.parse_data() with a two-entry photon-file list."""
    print('=== Testing parse_data() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    list_fname = b'test_dset_flist.txt'
    # List file naming the same EMC-format photon file twice
    with open(list_fname, 'w') as f:
        f.writelines([(recon_folder+b'/data/photons.emc\n').decode('utf-8'),
                      (recon_folder+b'/data/photons.emc\n').decode('utf-8')])
    num_dsets = dset.parse_data(list_fname)
    self.photons_tests(dset, num_dsets)
    ndset = dset.next  # second dataset in the chain (presumably a linked list) -- confirm
    self.photons_tests(ndset, num_dsets, False)
    # Re-parsing the same list should be safe (replaces previously parsed data)
    dset.parse_data(list_fname)
    os.remove(list_fname)
def test_calc_sum_fact(self):
    """calc_sum_fact() must store sum(log(k!)) over each frame's photon counts."""
    print('=== Testing calc_sum_fact() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    dset.parse_dataset(recon_folder+b'/data/photons.emc')
    dset.calc_sum_fact()

    def dense_frame(i):
        # Reconstruct the dense per-pixel photon counts of frame i from the
        # sparse representation (place_ones / place_multi with accumulators).
        # Extracted helper: the original duplicated this block for frames 0 and -1.
        frame = np.zeros(dset.num_pix, dtype='i4')
        o0 = dset.ones_accum[i]
        frame[dset.place_ones[o0:o0 + dset.ones[i]]] = 1
        m0 = dset.multi_accum[i]
        frame[dset.place_multi[m0:m0 + dset.multi[i]]] = \
            dset.count_multi[m0:m0 + dset.multi[i]]
        return frame

    # Compare first and last frames against a direct log-factorial sum
    self.assertAlmostEqual(np.log(scipy.special.factorial(dense_frame(0))).sum(),
                           dset.sum_fact[0])
    self.assertAlmostEqual(np.log(scipy.special.factorial(dense_frame(-1))).sum(),
                           dset.sum_fact[-1])
def test_make_blacklist(self):
    """Exercise make_blacklist(): no list, odd/even selection, file list, and both combined."""
    print('=== Testing make_blacklist() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    dset.parse_dataset(recon_folder+b'/data/photons.emc')
    # Empty path: nothing blacklisted (test data has 3000 frames)
    dset.make_blacklist(b'')
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 0)
    # odd_flag=2: odd-indexed frames are blacklisted (pattern 0,1,0,1,...)
    dset.make_blacklist(b'', odd_flag=2)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 1500)
    npt.assert_array_equal(dset.blacklist[:4], [0,1,0,1])
    # odd_flag=1: the complementary half (1,0,1,0,...)
    dset.make_blacklist(b'', odd_flag=1)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 1500)
    npt.assert_array_equal(dset.blacklist[:4], [1,0,1,0])
    # Blacklist file marking the first 10 frames
    blist_fname = recon_folder+b'/data/blacklist.dat'
    blist = np.zeros(dset.tot_num_data, dtype='u1')
    blist[:10] = 1
    np.savetxt(blist_fname.decode('utf-8'), blist, fmt='%d')
    dset.make_blacklist(blist_fname)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 10)
    npt.assert_array_equal(dset.blacklist[8:12], [1,1,0,0])
    # Behavior when both blacklist file and odd/even selection
    # Alternate frames which are not blacklisted by file are blacklisted
    dset.make_blacklist(blist_fname, odd_flag=2)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 1505)
    npt.assert_array_equal(dset.blacklist[8:12], [1,1,0,1])
    dset.make_blacklist(blist_fname, odd_flag=1)
    self.assertEqual(dset.blacklist.shape[0], 3000)
    self.assertEqual(dset.blacklist.sum(), 1505)
    npt.assert_array_equal(dset.blacklist[8:12], [1,1,1,0])
    os.remove(blist_fname)
def __init__(self, unlabeled_datasets=None, models=None, undersample_before_eval=False):
    '''
    unlabeled_datasets should be either (1) a string pointing to a single data file
    (e.g., "mydata.txt") or (2) a list of strings pointing to multiple data files
    that represent the same data with different feature spaces. For more on the
    data format, consult the doc or see the samples.
    '''
    # FIX: the default used to be a mutable `[]`, which Python evaluates once
    # and shares across every instance constructed without this argument.
    if unlabeled_datasets is None:
        unlabeled_datasets = []
    elif isinstance(unlabeled_datasets, str):
        # a string, presumably pointing to a single data file, was passed in;
        # box it in a list (isinstance replaces the old `type(...) == type("")`)
        unlabeled_datasets = [unlabeled_datasets]
    self.unlabeled_datasets = unlabeled_datasets
    # initialize empty labeled datasets (i.e., all data is unlabeled to begin with)
    self.labeled_datasets = [dataset.dataset([]) for d in unlabeled_datasets]
    self.models = models
    self.undersample_first = undersample_before_eval
    self.query_function = self.base_q_function  # throws exception if not overridden
    self.name = "Base"
    # default prediction function; only important if you're aggregating multiple
    # feature spaces (see cautious_predict function documentation)
    self.predict = self.majority_predict
def test_generate_data(self):
    """Exercise generate_data(): single photon file and a two-entry emc file list."""
    print('=== Testing generate_data() ===')
    det = self.create_det()
    dset = dataset.dataset(det)
    # Default config: a single in_photons_file entry
    dset.generate_data(config_fname)
    self.photons_tests(dset)
    dset.generate_data(config_fname)
    # Switch the config to a two-entry photon-file list
    list_fname = recon_folder+b'/test_photons_list.txt'
    with open(list_fname, 'w') as f:
        f.writelines(['data/photons.emc\n', 'data/photons.emc\n'])
    config = DragonflyConfig(config_fname)
    config.modify_entry('emc', 'in_photons_list', list_fname.decode('utf-8'))
    # Having both in_photons_file and in_photons_list set must fail
    self.assertRaises(AssertionError, dset.generate_data, config_fname)
    config.remove_entry('emc', 'in_photons_file')
    dset.generate_data(config_fname)
    self.photons_tests(dset, 2)
    ndset = dset.next  # second dataset in the chain (presumably a linked list) -- confirm
    self.photons_tests(ndset, 2, False)
    # Restore original config state for subsequent tests
    os.remove(list_fname)
    config.remove_entry('emc', 'in_photons_list')
    config.modify_entry('emc', 'in_photons_file', 'make_data:::out_photons_file')
RecIs = [item for item,score in I2Score[:N]] return RecIs if __name__ == "__main__": import sys import dataset print('***** begin *****');sys.stdout.flush() print('=== get dataset ===');sys.stdout.flush() no = 1 data = dataset.dataset(no, N=2000, M=400, K=20, R=5, seed=1) # print(data) print('=== create CF model ===');sys.stdout.flush() #sim_func_name = "pearson" sim_func_name = "cosine" #model_type_name = "normalized" model_type_name = "sim_multi_rating" cf_model = cf(data.Us, data.Is, data.U2I2Rating, sim_func_name, model_type_name) print(sim_func_name) print(model_type_name) # print("I2Us :", cf_model.I2Us) print('=== calc scores ===');sys.stdout.flush() U2I2Score = cf_model.calcU2I2Score(cf_model.Us) for user in cf_model.Us:
def run_experiments_hold_out(data_paths, outpath, hold_out_p = .25, datasets_for_eval = None, upto = None,
                             step_size = 25, initial_size = 2, batch_size = 5,
                             pick_balanced_initial_set = True, num_runs=10, report_results_after_runs=True):
    '''
    This method demonstrates how to use the active learning framework, and is also a functional
    routine for comparing learners. Basically, a number of runs will be performed, the active
    learning methods will be evaluated at each step, and results will be reported. The results
    for each run will be dumped to a text files, which then can be combined (e.g., averaged),
    elsewhere, or you can use the results_reporter module to aggregate and plot the output.

    @parameters
    --
    data_paths -- this is either a list (pointing to multiple feature spaces for the same
        instances) or a string pointing to a single data file (this will be the typical case).
        e.g., data_paths = "mydata.txt". curious_snake uses a sparse-formated weka-like
        format, documented elsewhere.
    outpath -- this is a directory under which all of the results will be dumped.
    hold_out_p -- the hold out percentage, i.e., how much of your data will be used for
        evaluation. you can ignore this is you're providing your own dataset(s) for
        evaluation (i.e., datasets_for_eval is not None).
    datasets_for_eval -- use this is you have datasets you want to use for testing -- i.e.,
        to specify your hold out set independent of the data in data_paths.
    upto -- active learning will stop when upto examples have been labeled. if this is None,
        upto will default to the total unlabeled pool available
    initial_size -- the size of 'bootstrap' set to use prior to starting active learning
        (for the initial models)
    batch_size -- the number of examples to be labeled at each iteration in active
        learning -- optimally, 1
    step_size -- results will be reported every time another step_size examples have
        been labeled
    pick_balanced_initial_set -- if True, the initial train dataset will be built over an
        equal number (initial_size/2) of both classes.
    num_runs -- this many runs will be performed
    report_results_after_runs -- if true, the results_reporter module will be used to
        generate output.
    '''
    for run in range(num_runs):
        print "\n********\non run %s" % run
        print data_paths
        num_labels_so_far = initial_size # set to initial size for first iteration
        if not os.path.isdir(outpath):
            os.mkdir(outpath)
        # if a string (pointing to a single dataset) is passed in, box it in a list
        data_paths = box_if_string(data_paths)
        datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
        total_num_examples = len(datasets[0].instances)
        test_datasets = []
        if datasets_for_eval is not None:
            # if a test set is specified, use it.
            datasets_for_eval = box_if_string(datasets_for_eval)
            test_datasets = [dataset.build_dataset_from_file(f) for f in datasets_for_eval]
            if upto is None:
                upto = total_num_examples
        else:
            # other wise, we copy the first (even if there multiple datasets, it won't
            # matter, as we're just using the labels) and pick random examples
            hold_out_size = int(hold_out_p * total_num_examples)
            test_instances = random.sample(datasets[0].instances, hold_out_size)
            test_instance_ids = [inst.id for inst in test_instances]
            # now remove them from the dataset(s)
            for d in datasets:
                cur_test_dataset = dataset.dataset(d.remove_instances(test_instance_ids))
                test_datasets.append(cur_test_dataset)
            # if no upper bound was passed in, use the whole pool U
            if upto is None:
                upto = total_num_examples - hold_out_size
            print "using %s out of %s instances for test set" % (hold_out_size, total_num_examples)
        print "U has cardinality: %s" % datasets[0].size()

        #
        # Here is where learners can be added for comparison
        #
        learners = [random_learner.RandomLearner([d.copy() for d in datasets]),
                    simple_learner.SimpleLearner([d.copy() for d in datasets]),
                    nb_learner.NBLearner([d.copy() for d in datasets])]

        # one per-run output file per learner
        output_files = [open("%s//%s_%s.txt" % (outpath, learner.name, run), 'w') for learner in learners]

        # we arbitrarily pick the initial ids from the first learner; this doesn't matter,
        # as we just use the instance ids
        initial_f = learners[0].get_random_unlabeled_ids
        init_size = num_labels_so_far
        if pick_balanced_initial_set:
            initial_f = learners[0].pick_balanced_initial_training_set
            init_size = int(num_labels_so_far/2.0) # equal number from both classes

        # Again, you could call *.initial_f on any learner -- it just returns the ids to
        # label initially. these will be the same for all learners.
        init_ids =initial_f(init_size)

        # label instances and build initial models
        for learner in learners:
            learner.label_instances_in_all_datasets(init_ids)
            learner.rebuild_models()

        # report initial results, to console and file.
        report_results(learners, test_datasets, num_labels_so_far, output_files)

        first_iter = True
        while num_labels_so_far <= upto - step_size:
            #
            # the main active learning loop
            #
            cur_step_size = step_size
            cur_batch_size = batch_size
            if first_iter:
                # here we account for the initial labeled dataset size. for example, suppose
                # the step_size is set to 25 (we want to report results every 25 labels),
                # but the initial size was 2; then we want to label 23 on the first iteration
                # so that we report results when 25 total labels have been provided
                cur_step_size = step_size - num_labels_so_far if num_labels_so_far <= step_size \
                                else step_size - (num_labels_so_far - step_size)
                # in general, step_size is assumed to be a multiple of batch_size, for the
                # first iteration, when we're catching up to to the step_size (as outlined
                # above), we set the batch_size to 1 to make sure this condition holds.
                cur_batch_size = 1
                first_iter = False

            for learner in learners:
                learner.active_learn(cur_step_size, num_to_label_at_each_iteration = cur_batch_size)

            num_labels_so_far += cur_step_size
            print "\n***labeled %s examples out of %s so far***" % (num_labels_so_far, upto)
            report_results(learners, test_datasets, num_labels_so_far, output_files)

        # close files
        for output_file in output_files:
            output_file.close()

    # post-experimental reporting
    if report_results_after_runs:
        results_reporter.post_runs_report(outpath, [l.name for l in learners], num_runs)
dset the dataset object output: P [1d ndarray] probability P(L|Ix) ''' #print("test>>mnode:{}".format(self)) if self.tau is None:#reaching terminal node return self.P else: #if (self.L is not None and goLeft) : if (dset.getI(self.theta,x)<self.tau) : return self.L.getP(x,dset) else: return self.R.getP(x,dset) def getL(self,x,dset): ''' input: x sample index [int] dset the dataset object output: L [integer] label ''' return np.argmax(self.getP(x,dset)) if __name__ == '__main__': from dataset import dataset dset=dataset() print dset root=mnode()
import tensorflow as tf
import dataset

# Load the training corpus: one label ('cuisine') and a bag of 'ingredients'
data = dataset.dataset()
data.load_from_json("train.json", 'cuisine', 'ingredients')

input_size = len(data.vocabulary)
categories = len(data.labels)


def weight_variable(shape):
    """Weight tensor initialized from a truncated normal (stddev 0.1)."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    """Bias tensor initialized to the constant 0.1."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def relu_activation(W, x, b=0.0):
    """ReLU(xW + b).

    FIX: `tf.relu` does not exist (the op lives at `tf.nn.relu`), and the
    original body read an undefined global `b` (NameError at call time).
    The bias is now an explicit parameter; its 0.0 default keeps any
    existing two-argument call sites working.
    """
    return tf.nn.relu(tf.matmul(x, W) + b)


def softmax(W, x):
    """Softmax over the logits xW.

    FIX: the original returned `tf.nn.softmax()` with no arguments, which
    raises a TypeError and ignored both parameters entirely.
    """
    return tf.nn.softmax(tf.matmul(x, W))


# placeholders
tf_x = tf.placeholder(tf.float32, shape=[None, input_size])
y_expected = tf.placeholder(tf.float32, shape=[None, categories])

# first hidden layer params. size is [lower_layer_size, upper_layer_size]
W_h1 = weight_variable([input_size, 1000])
import theano
import theano.tensor as T
import numpy as np
import time as ti
import dataset as d
import markhov as m
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d
from sklearn import metrics
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
srng = RandomStreams()
import pandas as pd
from PIL import Image

# Hold out 15% of the data for evaluation
ds = d.dataset(test_size=0.15)
ma = m.markhov()
# Fixed evaluation batch (emit=False presumably disables label emission/augmentation -- confirm)
x_gold, labels_gold = ds.test_batch(size=128, emit=False)

# define symbolic Theano variables
x = T.tensor4()  # image batch (4-d tensor)
t = T.matrix()   # target matrix

# define model: neural network
def floatX(x):
    # cast to Theano's configured float dtype
    return np.asarray(x, dtype=theano.config.floatX)

def init_weights(shape):
    # small random Gaussian initialization (stddev 0.1) as a shared variable
    return theano.shared(floatX(np.random.randn(*shape) * 0.1))

def meanfscore(y_pred, y_true):
    # sample-averaged F1 between rounded (binarized) targets and predictions
    return metrics.f1_score(np.array(np.rint(y_true), dtype="int"),
                            np.array(np.rint(y_pred), dtype="int"),
                            average='samples')
import uninet as ssb
import neuralnetwork_tensorflow as nntf
import cPickle
import activationFunction as af
import dataset

# Load a pre-pickled MNIST training set (Python 2 cPickle format)
old_trainingSet = cPickle.load(open("/media/tassadar/Work/Google Drive/My/NeuralNet/data/mnist/MNISTTrainingSet_square", 'rb'))
#cvSet = cPickle.load(open("/media/tassadar/Work/Google Drive/My/NeuralNet/data/mnist/MNISTTestSet_square", 'rb'))

# Re-wrap the old pickle in the current dataset class, then reshape the
# examples to cubic (image) layout -- see dataset.rearrangeToCubic
trainingSet = dataset.dataset(examples=old_trainingSet.examples, labels=old_trainingSet.labels)
trainingSet.rearrangeToCubic()

# Network: 28x28x1 input -> 3x3 conv (16 maps, ReLU) -> 10-way softmax, log-loss
l1 = ssb.input(input_size=[28,28,1])
l2 = ssb.convolutional(patchSize=3, strides=[1,1,1,1], depth=16, activationFunction=af.relu)
l3 = ssb.softmax(neurons=10)
net = nntf.neuralnetwork(layers=[l1,l2,l3], errorFunction=ssb.logLoss)

net.train(trainingSet=trainingSet, numEpochs=10000, minibatchSize=100, learningRate=0.1, errorCheckPeriod=100)
from dataset import dataset
import numpy as np
import matplotlib.pyplot as plt
import sys

# Usage: <script> <dataset_name>
ds_name = sys.argv[1]
dir_name = "datasets/%s" % (ds_name)

# Training-split interactions in binary (implicit-feedback) form
dataset_ts = dataset("%s_ts.txt" % (ds_name))
u2i = dataset_ts.user_item_matrix['binary']
i2u = dataset_ts.item_user_matrix['binary']

users = sorted(u2i.keys())
items = sorted(i2u.keys())

# FIX: use `with` so the files are closed even if a write fails
# (the original paired open()/close() manually).
with open("%s/user_history_lengths.txt" % (dir_name), 'w') as f:
    for user in users:
        f.write("%d %d\n" % (user, len(u2i[user])))

with open("%s/item_listening_lengths.txt" % (dir_name), 'w') as f:
    for item in items:
        f.write("%d %d\n" % (item, len(i2u[item])))
def main(_):
    """Entry point: load the dataset splits plus vocab maps and start training."""
    # dataset.dataset() yields (train_set, test_set, idx2word, word2idx);
    # forward the four values to train() unchanged
    corpus = dataset.dataset()
    train(*corpus)
data = {} for header, column in header_column.iteritems(): data[header] = None if column > len(row) else row[column-1] return data asr_id_rows = {} for idx, r in enumerate(values[1:]): row_number = idx + 2 row = data(r) asr_id_rows[row['ASR_ID']] = row # creates a CSV file with all pledges in the neighborhood boundary. with dataset() as ds: addressLayer = ds.layers['address'] neighborhoodLayer = ds.layers['neighborhood'] for neighborhood in neighborhoodLayer: neighborhoodName = neighborhood.GetField("name") print neighborhoodName targetDir = os.path.join(r'C:\personal\BeeSafeBoulder\GoogleDrive-BeeSafe\BeeSafe', neighborhoodName) if not os.path.exists(targetDir): os.mkdir(targetDir) addressLayer.SetSpatialFilter(neighborhood.GetGeometryRef()) addressLayer.SetAttributeFilter('ADDR_FMT = "EXACT"')
r.data[x] = float(r.data[x]) else: r.data[x] = -1 # 构造跟节点,构造集合S,S初始化包含根节点 # 当根节点非空时,执行操作 # 取出Sets中的一个节点R,如果 # R为单一属性的记录 # -->得到叶子节点 # 否则 # 根据最好属性进行分类 # 将得到的新的子节点放入到S中,从S中移除R # 构造对应的树结构 root_node = dataset(test_datas, []) unpure_list = [root_node] records = test_datas # attr = attr_list[4] # print attr # print 'split_by_attr(test_datas, attr): ', split_by_attr(test_datas, attr) # print 'gain_ratio(records, attr) : ', gain_ratio(records, attr) # print 'split_by_attr(records, attr_list[0]): ', split_by_attr(records, attr_list[0]) print records[0] [gains, attr_best] = [-1, None] for attr in attr_list[0:-1]: if gains<gain_ratio(records, attr): [gains, attr_best] = [gain_ratio(records, attr), attr] print gains, attr_best
def main():
    """Build the simulated dataset step by step, then save it as a GIF."""
    ds = dataset()
    ds.model_background_state()
    ds.model_fires(30)  # 30 presumably the number of fires or timesteps -- confirm in dataset
    ds.model_cloud_cover()
    ds._save_to_gif()  # NOTE(review): reaches into a private method of dataset
# https://github.com/tensorflow/tensorflow/issues/1541 import scipy.misc from keras.utils import np_utils import dataset import net np.random.seed(1337) n = 224 batch_size = 128 data_directory, = sys.argv[1:] X, y, tags = dataset.dataset(data_directory, n) nb_classes = len(tags) sample_count = len(y) train_size = sample_count * 4 // 5 X_train = X[:train_size] y_train = y[:train_size] Y_train = np_utils.to_categorical(y_train, nb_classes) X_test = X[train_size:] y_test = y[train_size:] Y_test = np_utils.to_categorical(y_test, nb_classes) def evaluate(model, vis_filename=None): Y_pred = model.predict(X_test, batch_size=batch_size) y_pred = np.argmax(Y_pred, axis=1)
parser.add_argument("--log", type=str, help="log file", default="rescal.log", required=False) parser.add_argument("--result", type=str, help="result file", default="result.txt", required=False) return parser.parse_args() if __name__ == '__main__': # Parsing arguments start_time = datetime.now() cliInputArgs = parseArguments() # Log file logFile = "./log/" + cliInputArgs.log # Result file resultToFile = open("./result/" + cliInputArgs.result, 'w') logger = Logger() runRescal = RunRescal(dataset(cliInputArgs.train, "UTF-8"), dataset(cliInputArgs.test, "UTF-8"), logger) config = {'numLatentComponents':cliInputArgs.latent, 'regularizationParam':cliInputArgs.lmbda, 'th':cliInputArgs.th} runRescal.rescal(config) # Start training result = rescal(runRescal.X, config['numLatentComponents'], lmbda=config['regularizationParam']) logger.getLog().info('[Tensor] Objective function value: %.3f' % result[2]) logger.getLog().info('[Tensor] Of iterations: %d' % result[3]) A = result[0] R = result[1] logger.getLog().info("[Tensor] Matrix A's shape: %s" % str(A.shape)) logger.getLog().info("[Tensor] Matrix R's shape: %s" % str(R[0].shape)) # _log.info("## Execute time: %s" % str(sum(result[4]))) # Evaluate algorithm performance resultToFile.write("-" * 20 + "\n")
if sample[0] == sample[1]: count += 1 self.logger.getLog().info("Çount: %d, Evaluation length: %d" % (count, len(evaluation))) # print "Count:", count, " Evaluation: ", len(evaluation) return totalScore * 1.0 / len(allRel), totalAccuracy * 1.0 / len(allRel) if __name__ == '__main__': start_time = datetime.now() dataArgs, algoArgs = parseArguments() logFile = "./log/" + dataArgs['log'] resultToFile = open("./result/" + dataArgs['result'], 'w') # 初始化logger和算法实例 logger = Logger() runRescal = RunRescal(dataset(dataArgs['train'], "UTF-8"), dataset(dataArgs['test'], "UTF-8"), logger) # 运行RESCAL和Tranlating Embedding算法 runRescal.rescal(algoArgs, False) runRescal.RunTransE("myData/FOAF/ent2id_c.txt", "myData/FOAF/rel2id_c.txt", "myData/FOAF/entity2vec.foaf.bern", "myData/FOAF/relation2vec.foaf.bern") # runRescal.tranE(loadPickle(dataArgs['embed'])) # t1, t2 = runRescal.training(0.001) # for t in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]: # for t in [0, ]: # runRescal.calEveryRelationScore(t, 1-t) # testCase = runRescal.pickPredictedResult() # roc, acc = runRescal.roc([(i[-1], i[-3]) for i in testCase]) # # for t in testCase: # # print t[0], t[1], t[2], t[3] # print "t1: %f, t2 %f, ROC: %f, ACC: %f" % (t, 1-t, roc, acc) # end_time = datetime.now()