def error_nn_classifier(size_data, batch_size, n_data_sets):
    """
    Compute the out-of-sample error of a nearest neighbor classifier.

    :param size_data: int
        Size of the training set
    :param batch_size: int
        Size of the test set
    :param n_data_sets: int
        Number of data sets to be evaluated
    :return: array-like of shape (n_data_sets,)
        Out-of-sample error for different training sets
    """
    test_set = create_data(batch_size)
    oses = np.empty(n_data_sets)
    for i in range(n_data_sets):
        data = create_data(size_data)
        prediction = nn_classifier(data, test_set)
        n_errors = np.sum(np.abs(np.subtract(prediction, test_set[:, 1])))
        ose = 100 * n_errors / batch_size
        oses[i] = ose
    return oses
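# A minimal driver for error_nn_classifier -- a sketch only: it reuses the
# create_data()/nn_classifier() helpers assumed above and numpy imported as
# np; the sizes below are illustrative, not taken from the original code.
if __name__ == '__main__':
    oses = error_nn_classifier(size_data=50, batch_size=1000, n_data_sets=100)
    print("mean out-of-sample error: %.2f%% (std %.2f)"
          % (np.mean(oses), np.std(oses)))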
def main():
    assert len(sys.argv) == 5, COMMAND
    num_titles = int(sys.argv[1])
    filename = sys.argv[2]
    top5000_filename = sys.argv[3]
    titles_filename = sys.argv[4]
    create_data(num_titles, filename, top5000_filename, titles_filename)
def plot_attAUC(GT, attributepattern, clf):
    AUC = []
    P = np.loadtxt(attributepattern)
    attributes = get_attributes()

    # Loading ground truth
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')
    test_attributes = get_class_attributes('./Classes/', name='test')
    _, y_true = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                            test_index, test_attributes)

    for i in range(y_true.shape[1]):
        fp, tp, _ = roc_curve(y_true[:, i], P[:, i])
        roc_auc = auc(fp, tp)
        AUC.append(roc_auc)

    print("Mean attrAUC %g" % (np.nanmean(AUC)))

    xs = np.arange(y_true.shape[1])
    width = 0.5

    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 1, 1)
    rects = ax.bar(xs, AUC, width, align='center')
    ax.set_xticks(xs)
    ax.set_xticklabels(attributes, rotation=90)
    ax.set_ylabel("area under ROC curve")
    autolabel(rects, ax)
    plt.savefig('results/AwA-AttAUC-DAP-%s.pdf' % clf)
def main():
    # Create Result Directory
    os.makedirs('./results/predict', exist_ok=True)

    # Get Arguments
    args = args_initialize()

    # Define Model
    net_G = ResNetGenerator(
        input_nc=args.input_nc, output_nc=args.output_nc,
        ngf=args.ngf, n_blocks=9
    )

    # Load Weights
    state_dict = torch.load('./latest_net_G.pth', map_location='cpu')
    net_G.load_state_dict(state_dict)

    # Create Tensor from Image file
    im_file = args.imfile
    tensor_img = utils.create_data(im_file)

    # Predict
    outputs = net_G.forward(tensor_img)[0]

    # Convert Output Tensor to Image file
    im = utils.tensor2im(outputs)
    file_name = os.path.basename(im_file)
    save_path = os.path.join('./results/predict',
                             'horse2zebra_' + str(file_name) + '.png')
    utils.save_image(im, save_path)
def load_data(base_path="../data"):
    """
    Load the data in PyTorch Tensor.

    :return: (zero_train_matrix, train_data, valid_data, test_data)
        WHERE:
        zero_train_matrix: 2D sparse matrix where missing entries are
            filled with 0.
        train_data: 2D sparse matrix
        valid_data: A dictionary {user_id: list, question_id: list,
            is_correct: list}
        test_data: A dictionary {user_id: list, question_id: list,
            is_correct: list}
    """
    new_train_matrix, new_sparse_train_matrix, valid_data, test_data = \
        create_data(base_path)

    zero_train_matrix = new_sparse_train_matrix.copy()
    # Fill in the missing entries with 0.
    zero_train_matrix[np.isnan(new_sparse_train_matrix)] = 0
    # Change to Float Tensor for PyTorch.
    zero_train_matrix = torch.FloatTensor(zero_train_matrix)
    train_matrix = torch.FloatTensor(new_sparse_train_matrix)

    return zero_train_matrix, train_matrix, valid_data, test_data
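# Hypothetical usage sketch (not from the original file): inspect the shapes
# and sizes returned by load_data before wiring them into a training loop.
if __name__ == '__main__':
    zero_train_matrix, train_matrix, valid_data, test_data = load_data("../data")
    print(zero_train_matrix.shape, train_matrix.shape)
    print(len(valid_data["is_correct"]), len(test_data["is_correct"]))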
def Predict_Mode():
    print("\n", "Test Mode select ... ", "\n")

    # Dictionary with indices as keys and relation names as values.
    id2tag = dict((y, x) for x, y in r_dic.items())

    # Load the same word2idx that was used for training.
    with open("./Pickles/word2idx.pkl", "rb") as fin:
        word2idx = pickle.load(fin)

    # Reload the best model.
    Model = torch.load("BestModel.pt")

    # Load the file to run prediction on.
    Pre_data = data_loading(opts.predict, p_dic, "+")

    sentence = []
    for line in open(opts.predict):
        line = line.strip()
        if len(line) == 0:
            continue
        if line[0] == ";":
            sentence.append(line)

    # File to which the dependency-parsing results are written.
    out = open("../output/result.txt", "w")

    Pre = create_data(Pre_data, word2idx, r_dic)
    Model.train(False)
    for i in range(len(Pre_data)):
        out.write(sentence[i] + "\n")
        # Run the best model on each sentence.
        points, labels = predict(Model, Pre[i])
        points[-1] = -1
        for j in range(len(Pre_data[i][:-1])):
            input = Pre_data[i][j]
            out.write(str(input["current_idx"] + 1) + "\t" +
                      str(points[j] + 1) + "\t" +
                      id2tag[labels[j]] + "\t" +
                      input["pure_morphemes"] + "\n")
        out.write("\n")
    print("predict success ... ")
def error_threshold_classifier(type, analytical=False, batch=None,
                               n_data_sets=None, threshold=None):
    """
    Calculate the out-of-sample error (generalization error) over a given
    number of data sets.

    :param type: string
        Classifier type
    :param analytical: bool
        If True, calculate the analytical error of the classifier;
        otherwise the empirical error
    :param batch: int
        Size of the test set
    :param n_data_sets: int
        Number of data sets to be evaluated
    :param threshold: int
        Threshold of the classifier
    :return: If analytical is False, array-like of shape (n_data_sets,) --
        out-of-sample error for different training sets.
        Otherwise, int -- analytical error.
    """
    if analytical is False:
        oses = np.empty(n_data_sets)
        for i in range(n_data_sets):
            test_set = create_data(batch)
            prediction = threshold_classifier(type=type, X=test_set[:, 0],
                                              threshold=threshold)
            n_errors = np.sum(np.abs(np.subtract(prediction, test_set[:, 1])))
            ose = n_errors * 100 / batch
            oses[i] = ose
        return oses
    else:
        a_error = threshold_classifier(type=type, threshold=threshold,
                                       error=True)
        return a_error
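# A sketch comparing the empirical and analytical errors. 'linear' is a
# made-up classifier-type string -- substitute whatever strings
# threshold_classifier actually accepts; threshold=0 is also illustrative.
if __name__ == '__main__':
    empirical = error_threshold_classifier('linear', batch=1000,
                                           n_data_sets=100, threshold=0)
    analytical = error_threshold_classifier('linear', analytical=True,
                                            threshold=0)
    print("empirical mean:", np.mean(empirical), "analytical:", analytical)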
def main():
    start = time.time()
    args = parser.parse_args()
    model = import_module(args.model_path)
    if args.create_data:
        utils.create_data(model, args.number_of_core_samples, args.step_size,
                          args.name, args.output_path)
    if args.create_unstructured_data:
        model.create_unstructured_data(model, args.number_of_core_samples,
                                       args.name, args.output_path)
    if args.rank_global:
        utils.rank_global(model, args.number_of_core_samples, args.step_size,
                          args.name, args.plot, args.output_path)
    if args.rank_local:
        print(utils.rank_local(model, args.number_of_core_samples,
                               args.step_size, args.name, args.threshold,
                               args.plot, args.output_path))
    if args.measure_global_accuracy:
        print(utils.measure_global_accuracy(model, args.number_of_core_samples,
                                            args.step_size, args.name,
                                            args.output_path))
    if args.measure_local_accuracy:
        print(utils.measure_local_accuracy(model, args.number_of_core_samples,
                                           args.step_size, args.name,
                                           args.output_path))
    print('this took {} seconds'.format(time.time() - start))
def get_data(self):
    ds = utils.create_data()
    mids = np.linspace(-7. / 8. * np.pi, np.pi, 16).astype(np.float32)
    data = mids[np.random.randint(mids.size, size=ds['x_wind'].size)]
    ds['albatros_flight_direction'] = (('time', 'longitude', 'latitude'),
                                       data.reshape(ds['x_wind'].shape),
                                       {'units': 'radians'})
    return ds[['albatros_flight_direction']]
def test_small_time(self):
    ds = create_data()
    sm_time = tinylib.small_time(ds['time'])
    num_times, units = tinylib.expand_small_time(sm_time['packed_array'])
    actual = xray.Dataset({'time': ('time', num_times, {'units': units})})
    actual = xray.decode_cf(actual)
    self.assertTrue(np.all(actual['time'].values == ds['time'].values))
    self.assertTrue(units == ds['time'].encoding['units'])
def test_version_assert(self):
    # Create a forecast that looks like it's from a newer version
    # and make sure an assertion is raised.
    ds = create_data()
    original_version = compress._VERSION
    compress._VERSION = np.array(compress._VERSION + 1, dtype=np.uint8)
    beaufort = compress.compress_dataset(ds)
    compress._VERSION = original_version
    self.assertRaises(ValueError,
                      lambda: compress.decompress_dataset(beaufort))
def get_data(self):
    ds = utils.create_data()
    bins = self.get_scheme().bins
    mids = 0.5 * (bins[1:] + bins[:-1])
    data = mids[np.random.randint(mids.size, size=ds['x_wind'].size)]
    ds['great_white_shark_length'] = (('time', 'longitude', 'latitude'),
                                      data.reshape(ds['x_wind'].shape),
                                      {'units': 'm'})
    return ds[['great_white_shark_length']]
def indirectAttributePrediction(classifier='SVM'):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./', name='train')
    test_attributes = get_class_attributes('./', name='test')

    # Create training Dataset
    print('Creating training dataset...')
    X_train, a_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)
    y_train = []
    for (animal, num) in train_index:
        y_train += num * [animal_dict[animal]]
    y_train = np.array(y_train)

    print('X_train to dense...')
    X_train = X_train.toarray()

    print('Creating test dataset...')
    X_test, a_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    print('X_test to dense...')
    X_test = X_test.toarray()

    clf = SVMClassifierIAP(n_components=100, C=1.0)

    print('Training model... (takes around 10 min)')
    t0 = time()
    clf.fit(X_train, y_train)
    print('Training finished in', time() - t0)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)

    print('Saving files...')
    np.savetxt('./IAP/prediction_SVM', y_pred)
    np.savetxt('./IAP/probabilities_SVM', y_proba)
def test_subset_spot_dataset(self):
    fcst = utils.create_data()
    times, units, cal = xray.conventions.encode_cf_datetime(fcst['time'])
    assert 'hours' in units

    def test_one_query(lon_slice, lat_slice, hour_slice):
        lon = np.mean(fcst['longitude'].values[lon_slice])
        lat = np.mean(fcst['latitude'].values[lat_slice])
        hours = times[hour_slice]
        query = {'location': {'latitude': lat, 'longitude': lon},
                 'model': 'gefs',
                 'type': 'spot',
                 'hours': hours,
                 'variables': ['wind']}
        ss = subset.subset_spot_dataset(fcst, query)

        assert fcst['x_wind'].dims == ('time', 'longitude', 'latitude')
        expected = np.mean(np.mean(
            fcst['x_wind'].values[hour_slice, lon_slice, lat_slice],
            axis=2), axis=1)
        np.testing.assert_array_almost_equal(ss['x_wind'].values.reshape(-1),
                                             expected)
        assert fcst['y_wind'].dims == ('time', 'longitude', 'latitude')
        expected = np.mean(np.mean(
            fcst['y_wind'].values[hour_slice, lon_slice, lat_slice],
            axis=2), axis=1)
        np.testing.assert_array_almost_equal(ss['y_wind'].values.reshape(-1),
                                             expected)
        np.testing.assert_array_equal(ss['latitude'].values, lat)
        np.testing.assert_array_equal(ss['longitude'].values, lon)

    # test a query with lat/lon in the middle of a grid.
    test_one_query(slice(0, 2), slice(0, 2), slice(0, None, 3))
    # and with the lat/lon exactly on a grid
    test_one_query(slice(1, 2), slice(1, 2), slice(1, None, 3))
def test_subset(self):
    fcst = utils.create_data()
    query = {'hours': np.array([0., 2., 4., 6.]),
             'domain': {'N': np.max(fcst['latitude'].values) - 1,
                        'S': np.min(fcst['latitude'].values) + 1,
                        'E': np.max(fcst['longitude'].values) - 1,
                        'W': np.min(fcst['longitude'].values) + 1},
             'grid_delta': (1., 1.),
             'variables': ['wind']}
    ss = subset.subset_dataset(fcst, query)
    np.testing.assert_array_equal(
        ss['longitude'].values,
        np.arange(query['domain']['W'], query['domain']['E'] + 1))
    np.testing.assert_array_equal(
        np.sort(ss['latitude'].values),
        np.arange(query['domain']['S'], query['domain']['N'] + 1))
def test_compress_dataset(self):
    ds = create_data()
    compressed = compress.compress_dataset(ds)
    actual = compress.decompress_dataset(compressed)
    np.testing.assert_allclose(actual['x_wind'].values,
                               ds['x_wind'].values,
                               atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(actual['y_wind'].values,
                               ds['y_wind'].values,
                               atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(actual['air_pressure_at_sea_level'].values,
                               ds['air_pressure_at_sea_level'].values,
                               atol=1e-4, rtol=1e-4)

    # pass it through the system again, it should be idempotent.
    compressed = compress.compress_dataset(ds)
    actual = compress.decompress_dataset(compressed)
    np.testing.assert_allclose(actual['x_wind'].values,
                               ds['x_wind'].values,
                               atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(actual['y_wind'].values,
                               ds['y_wind'].values,
                               atol=1e-4, rtol=1e-4)
    np.testing.assert_allclose(actual['air_pressure_at_sea_level'].values,
                               ds['air_pressure_at_sea_level'].values,
                               atol=1e-4, rtol=1e-4)
def testAll():
    results_dir = "results/"
    filename = "static_arithmetic_test.txt"
    for _op, op_func in operations.items():
        x, y, x_test, y_test = create_data(50000, 100, 0, 1000, 1000, 10000,
                                           op_func)
        print("In operation {}".format(_op))

        print("NAC")
        model = NAC(100, 2, 1)
        nac_err = model.train(x, y, x_test, y_test)
        tf.reset_default_graph()

        counter = 0
        nalu_err = np.nan
        while np.isnan(nalu_err) and counter < 10:  # NALU can often become NaN
            counter += 1
            print("NALU")
            model = NALU(100, 2, 1)
            nalu_err = model.train(x, y, x_test, y_test)
            tf.reset_default_graph()

        print("MLP")
        model = MLP(100, 2, 1)
        # Error of the untrained (randomly initialized) MLP as a baseline.
        random_err, _ = model.validate(x_test, y_test)
        mlp_err = model.train(x, y, x_test, y_test)
        tf.reset_default_graph()

        max_score = np.nanmax([nac_err, nalu_err, random_err, mlp_err])
        with open(results_dir + filename, "a") as f:
            f.write("\n{}\n".format(_op))
            f.write("NAC err: {} | {}\n".format(nac_err, nac_err / max_score))
            f.write("NALU err: {} | {}\n".format(nalu_err,
                                                 nalu_err / max_score))
            f.write("MLP err: {} | {}\n".format(mlp_err, mlp_err / max_score))
            f.write("Random err: {} | {}\n".format(random_err,
                                                   random_err / max_score))
def test_forecast_containing_point(self):
    fcst = utils.create_data()
    lat = np.random.uniform(np.min(fcst['latitude'].values),
                            np.max(fcst['latitude'].values))
    lon = np.random.uniform(np.min(fcst['longitude'].values),
                            np.max(fcst['longitude'].values))
    query = {'location': {'latitude': lat, 'longitude': lon},
             'model': 'gfs',
             'type': 'spot',
             'hours': np.linspace(0, 9, 3).astype('int'),
             'variables': ['wind'],
             'warnings': []}
    modified_query = subset.query_containing_point(query)
    ss = subset.subset_gridded_dataset(fcst, modified_query)
    self.assertTrue(np.any(lon >= ss['longitude'].values))
    self.assertTrue(np.any(lon <= ss['longitude'].values))
    self.assertTrue(np.any(lat >= ss['latitude'].values))
    self.assertTrue(np.any(lat <= ss['latitude'].values))
    # we should be able to pass the results through again and get the
    # same thing.
    subset2 = subset.subset_gridded_dataset(ss, modified_query)
    self.assertTrue(subset2.equals(ss))
word_dir = args.word_dir
vector_dir = args.vector_dir
train_dir = args.train_dir
dev_dir = args.dev_dir
test_dir = args.test_dir
num_lstm_layer = int(args.num_lstm_layer)
num_hidden_node = int(args.num_hidden_node)
dropout = float(args.dropout)
batch_size = int(args.batch_size)
patience = int(args.patience)

startTime = datetime.now()

print('Loading data...')
input_train, output_train, input_dev, output_dev, input_test, output_test, \
    alphabet_tag, max_length = utils.create_data(word_dir, vector_dir,
                                                 train_dir, dev_dir, test_dir)

print('Building model...')
time_step, input_length = np.shape(input_train)[1:]
output_length = np.shape(output_train)[2]
ner_model = network.building_ner(num_lstm_layer, num_hidden_node, dropout,
                                 time_step, input_length, output_length)
print('Model summary...')
ner_model.summary()

print('Training model...')
early_stopping = EarlyStopping(patience=patience)
history = ner_model.fit(input_train, output_train, batch_size=batch_size,
                        epochs=1000,
                        validation_data=(input_dev, output_dev),
                        callbacks=[early_stopping])
# create_data.py - create and process input data into outputted JSON lists.
from utils import create_data

if __name__ == '__main__':
    create_data(
        train_folders=['../input_data/train2014', '../input_data/val2014'],
        test_folders=[
            '../input_data/BSD100/image_SRF_4',
            '../input_data/Set5/image_SRF_4',
            '../input_data/Set14/image_SRF_4'
        ],
        min_size=100,
        output_folder='../output_lists')

    # create_data(train_folders=['dataset/BSDS300/images/train'],
    #             test_folders=['dataset/BSDS300/images/test'],
    #             min_size=100,
    #             output_folder='../output_lists')
def get_data(self):
    scheme = self.get_scheme()
    ds = utils.create_data()
    for x in scheme.variables:
        ds = utils.add_tiny_variable(x, ds)
    return ds
else:
    qid2title_add = _pickle.load(open(qid2title_add_fi, "rb"))

# Merge EN and HR&LR worlds
qid2title.update(qid2title_add)

# DATA
if weight_hr == -1:  # Zero-Shot
    data_tr_sampled = "data/{}/mentions_tr_ZS_hr={}_size={}.txt".format(
        lang, hr_lang, num_data)
else:
    data_tr_sampled = "data/{}/mentions_tr_hr={}_size={}.txt".format(
        lang, hr_lang, num_data)
if not os.path.exists(data_tr_sampled):
    create_data(path_tr, path_tr_hr, data_tr_sampled, num_data, qid2title,
                weight_hr)

# TOKENIZER
if weight_hr == -1:  # Zero-Shot
    path = "data/{}/charagram_ZS_hr={}_vocabulary_size={}.pkl".format(
        lang, hr_lang, num_data)
else:
    path = "data/{}/charagram_hr={}_vocabulary_size={}.pkl".format(
        lang, hr_lang, num_data)
if os.path.exists(path):
    tokenizer = tok_ngram(data_tr_sampled, path)
    tokenizer.load()
else:
    tokenizer = tok_ngram(data_tr_sampled, path)
    tokenizer.train()
lf_3d_T = lambda x: np.atleast_2d(lf_3d(x)).T
hf_3d_T = lambda x: np.atleast_2d(hf_3d(x)).T


def create_mfgp_obj(dim, lf, hf, X_hf):
    # model = models.GPDF(dim, 0.001, 2, hf, lf)
    model = models.NARGP(dim, hf, lf, add_noise=True)
    model.fit(X_hf)
    return model


if __name__ == '__main__':
    dim = 3
    X_lf, Y_lf, X_hf, Y_hf, X_test = utils.create_data(lf_3d, hf_3d, dim)
    Y_test = hf_3d_T(X_test)
    mfgp_obj = utils.create_mfgp_obj(dim, lf_3d_T, hf_3d_T, X_hf,
                                     method='GPDF', add_noise=True)
    actual_mean = utils.analytical_mean(a, constant=5)
    actual_variance = utils.analytical_var(a)
    distribution = cp.J(cp.Uniform(0, 1), cp.Uniform(0, 1), cp.Uniform(0, 1))
    temp_f = lambda x: mfgp_obj.predict(x)[0]
    cp_wrapper = cpw.ChaospyWrapper(temp_f, distribution,
                                    polynomial_order=10,
                                    quadrature_order=10)
pattern_tag_list = []
words_to_ignore = ['?', '.', '!']
inputs = []
targets = []

# Loading data from json
intents = load_data('Data/intents.json')

# Preparing data by tokenizing and stemming
# (unpack directly instead of rebinding the prepare_data function name)
words, tags = prepare_data(intents, tags, words, pattern_tag_list,
                           words_to_ignore)

# Creating data for training
# (likewise, avoid shadowing the create_data function)
inputs, targets = create_data(inputs, targets, pattern_tag_list, tags, words)

# Training data
dataset = ChatbotDataset(inputs, targets)
train_loader = DataLoader(dataset=dataset, batch_size=8, shuffle=True)

input_size = len(inputs[0])
hidden_size = len(tags)
output_size = len(tags)

model = NeuralNetwork(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training phase
import numpy as np

import utils

if __name__ == '__main__':
    X_norm, _ = utils.create_data()
    A = utils.create_affinity_matrix(X_norm)

    D = np.diag(np.sum(A, axis=1))
    L = D - A
    # The Laplacian is symmetric, so use eigh, which also returns the
    # eigenvalues in ascending order; eigvecs[:, 1] is then the Fiedler
    # vector (np.linalg.eig does not guarantee any ordering).
    eigvals, eigvecs = np.linalg.eigh(L)

    n_dim = eigvecs.shape[0]
    p = np.zeros(n_dim)
    p[eigvecs[:, 1] > 0] = 1.0

    utils.show_result(X_norm, p)
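# For reference, the same two-way spectral cut on a tiny hand-built affinity
# matrix, with no dependence on utils -- purely illustrative. Two node pairs
# are joined by strong edges and bridged by weak ones, so the sign of the
# Fiedler vector separates them.
A_toy = np.array([[0.0, 1.0, 0.1, 0.0],
                  [1.0, 0.0, 0.0, 0.1],
                  [0.1, 0.0, 0.0, 1.0],
                  [0.0, 0.1, 1.0, 0.0]])
L_toy = np.diag(A_toy.sum(axis=1)) - A_toy
_, vecs = np.linalg.eigh(L_toy)        # eigenvalues in ascending order
print((vecs[:, 1] > 0).astype(float))  # e.g. [0. 0. 1. 1.]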
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==========================================================

print("Loading Data...")
process_data("data/processed/sst1.p")
x = _pickle.load(open("data/processed/sst1.p", "rb"))
revs, embedding, W2, word_idx_map, vocab, max_length = \
    x[0], x[1], x[2], x[3], x[4], x[5]
x_train, y_train, x_dev, y_dev = create_data(revs, word_idx_map, max_length,
                                             FLAGS.num_classes)

# Training
# ==========================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Code that operates on the default graph and session comes here
        if FLAGS.random:
            embedding = W2
        cnn = TextCNN(seq_length=x_train.shape[1],
                      num_classes=y_train.shape[1],
                      vocab_size=len(vocab) + 1,
def DirectAttributePrediction(predicate_type='binary'):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')
    val_index = bzUnpickle('./CreatedData/validation_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./Classes/', name='train',
                                            predicate_type=predicate_type)
    test_attributes = get_class_attributes('./Classes/', name='test',
                                           predicate_type=predicate_type)
    N_ATTRIBUTES = train_attributes.shape[1]

    # Create training Dataset
    print('Creating training dataset...')
    X_train, y_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)

    print('Creating seen test dataset...')
    X_test_seen, y_test_seen = create_data(
        './CreatedData/validation_featuresVGG19.pic.bz2',
        val_index, train_attributes)
    y_pred_ = np.zeros(y_test_seen.shape)
    y_proba_ = np.copy(y_pred_)

    print('X_train to dense...')
    X_train = X_train.toarray()
    print('X_test_seen to dense...')
    X_test_seen = X_test_seen.toarray()

    print('Creating test dataset...')
    X_test, y_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    y_pred = np.zeros(y_test.shape)
    y_proba = np.copy(y_pred)
    print('X_test to dense...')
    X_test = X_test.toarray()

    if predicate_type != 'binary':
        clf = NeuralNetworkRegressor(dim_features=X_train.shape[1],
                                     nb_attributes=N_ATTRIBUTES)
    else:
        clf = NeuralNetworkClassifier(dim_features=X_train.shape[1],
                                      nb_attributes=N_ATTRIBUTES)

    print('Fitting Neural Network...')
    # fix random seed for reproducibility
    # seed = 7
    # numpy.random.seed(seed)
    # X_train_, X_test_, y_train_, y_test_ = train_test_split(
    #     X_train, y_train, test_size=1, random_state=seed)
    his = clf.fit(X_train, y_train)

    print('Predicting attributes...')
    y_pred = np.array(clf.predict(X_test))
    y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[1])).T
    y_proba = y_pred

    y_pred_ = np.array(clf.predict(X_test_seen))
    y_pred_ = y_pred_.reshape((y_pred_.shape[0], y_pred_.shape[1])).T
    y_proba_ = y_pred_

    print('Saving files...')
    np.savetxt('./DAP_' + predicate_type + '/prediction_NN', y_pred)
    np.savetxt('./DAP_' + predicate_type + '/xprediction_NN', y_pred_)
    if predicate_type == 'binary':
        np.savetxt('./DAP_' + predicate_type + '/probabilities_NN', y_proba)
        np.savetxt('./DAP_' + predicate_type + '/xprobabilities_NN', y_proba_)
def Train_Mode():
    print("\n", "Train Mode select ... ", "\n")

    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    assert os.path.isfile(opts.test)

    embeddings = {}
    try:
        # If a pre_emb file exists, load the pretrained embedding file.
        assert os.path.isfile(opts.pre_emb)
        with open(opts.pre_emb, "rb") as fin:
            embeddings = pickle.load(fin)
        print("vocab loading success ... ")
    except:
        # Without a pre_emb file, train with random embeddings.
        print("embedding is random...")

    Tr_data = data_loading(opts.train, p_dic, "+")  # Load the training data.
    Dev_data = data_loading(opts.dev, p_dic, "+")   # Load the development data.
    Te_data = data_loading(opts.test, p_dic, "+")   # Load the test data.
    print("data_loading success ...")

    word_dim = parameters["word_dim"]  # Dimension of the word embeddings.
    # Build a dictionary with words as keys and indices as values.
    word2idx = create_word2idx(Tr_data, p_dic)
    print("word2idx create success ...")

    # Initialize the embedding-layer weights at random.
    matrix = np.random.uniform(-np.sqrt(1.0), np.sqrt(1.0),
                               (len(word2idx), word_dim))
    for w in word2idx:
        if w in embeddings:
            # Copy in the pretrained embedding when one exists for this word.
            matrix[word2idx[w]] = embeddings[w]
    print("word matrix create success ...")

    # Save word2idx as a pickle file.
    with open("./Pickles/word2idx.pkl", "wb") as fout:
        pickle.dump(word2idx, fout)
    print("Pickle file save success ...")

    # Convert the data into a trainable form
    # (each morpheme and its tag mapped to indices).
    Tr = create_data(Tr_data, word2idx, r_dic)
    Dev = create_data(Dev_data, word2idx, r_dic)
    Te = create_data(Te_data, word2idx, r_dic)

    # Build the Pointer Networks model.
    Model = PointerNetworks(word_dim=parameters["word_dim"],
                            lstm_width=parameters["lstm_dim"],  # LSTM dimension.
                            nword=len(word2idx),
                            weights=matrix,
                            drop_rate=parameters["dropout"])  # Dropout parameter.
    print("Model Create...")

    # Define the optimizer; the learning rate is 0.0001.
    optimizer = torch.optim.Adam(Model.parameters(), lr=0.0001)

    max_v = -100  # Best score achieved so far on the dev data.
    Model.train(True)
    for e in range(opts.epoch):
        losses = []
        # random.shuffle(Tr)
        for i in range(len(Tr)):
            input = Tr[i]
            Model.zero_grad()
            train_dic = {"inputs": input["morphemes"],
                         "prime_idx": input["prime_idx"],
                         "istrain": 1,
                         "drop_rate": 0.2,
                         "point_idx": input["point_idx"],
                         "pointings": input["pointings"]}
            _, predict_points, predict_labels, _1 = Model(train_dic)

            answer_point = torch.LongTensor(input["point_idx"])
            answer_label = torch.LongTensor(input["label_idx"])

            # Loss on the dependency arcs.
            cost1 = torch.nn.functional.cross_entropy(predict_points,
                                                      Variable(answer_point))
            # Loss on the dependency labels.
            cost2 = torch.nn.functional.cross_entropy(predict_labels,
                                                      Variable(answer_label))
            # Combine cost1 and cost2 with fixed mixing weights.
            costs = 0.8 * cost1 + 0.2 * cost2
            costs.backward()
            torch.nn.utils.clip_grad_norm(Model.parameters(), 5.0)
            optimizer.step()
            losses.append(float(costs))  # Loss for one sentence.

            # Every 3000 sentences: report the average loss, evaluate on the
            # dev data, and save the model whenever performance improves.
            if len(losses) == 3000:
                print("Epoch : " + str(e) +
                      ", sentence_num : " + str(i + 1) +
                      ", average_loss : " +
                      str(round(sum(losses) / len(losses), 2)) +
                      ", max_uas : " + str(max_v))
                losses = []
                Model.train(False)
                if e > 2:
                    uas, las = evaluation(Model, Dev)  # Evaluate on dev data.
                    if uas > max_v:  # Replace max_v when the UAS improves.
                        max_v = uas
                        print("Best Model changes ... ")
                        torch.save(Model, "BestModel.pt")  # Save the model.
                Model.train(True)
def DirectAttributePrediction(classifier='SVM', predicate_type='binary',
                              C=10.0):
    # Get features index to recover samples
    train_index = bzUnpickle('./CreatedData/train_features_index.txt')
    test_index = bzUnpickle('./CreatedData/test_features_index.txt')

    # Get classes-attributes relationship
    train_attributes = get_class_attributes('./', name='train',
                                            predicate_type=predicate_type)
    test_attributes = get_class_attributes('./', name='test',
                                           predicate_type=predicate_type)
    N_ATTRIBUTES = train_attributes.shape[1]

    # Create training Dataset
    print('Creating training dataset...')
    X_train, y_train = create_data('./CreatedData/train_featuresVGG19.pic.bz2',
                                   train_index, train_attributes)
    print('X_train to dense...')
    X_train = X_train.toarray()

    Xplat_train, Xplat_val, yplat_train, yplat_val = train_test_split(
        X_train, y_train, test_size=0.10, random_state=42)

    print('Creating test dataset...')
    X_test, y_test = create_data('./CreatedData/test_featuresVGG19.pic.bz2',
                                 test_index, test_attributes)
    y_pred = np.zeros(y_test.shape)
    y_proba = np.copy(y_pred)
    print('X_test to dense...')
    X_test = X_test.toarray()

    # CHOOSING SVM
    if classifier == 'SVM':
        platt_params = []
        for i in range(N_ATTRIBUTES):
            print('--------- Attribute %d/%d ---------'
                  % (i + 1, N_ATTRIBUTES))
            t0 = time()

            # SVM classifier
            if predicate_type == 'binary':
                clf = SVMClassifier()
            else:
                clf = SVMRegressor()

            # Training
            clf.fit(X_train, y_train[:, i])
            print('Fitted classifier in: %fs' % (time() - t0))
            if predicate_type == 'binary':
                clf.set_platt_params(Xplat_val, yplat_val[:, i])

            # Predicting
            print('Predicting for attribute %d...' % (i + 1))
            y_pred[:, i] = clf.predict(X_test)
            if predicate_type == 'binary':
                y_proba[:, i] = clf.predict_proba(X_test)

        print('Saving files...')
        np.savetxt('./DAP_' + predicate_type + '/prediction_SVM', y_pred)
        if predicate_type == 'binary':
            np.savetxt('./DAP_' + predicate_type + '/platt_params_SVM',
                       platt_params)
            np.savetxt('./DAP_' + predicate_type + '/probabilities_SVM',
                       y_proba)

    # CHOOSING NEURAL NETWORK
    if classifier == 'NN':
        if predicate_type != 'binary':
            clf = NeuralNetworkRegressor(dim_features=X_train.shape[1],
                                         nb_attributes=N_ATTRIBUTES)
        else:
            clf = NeuralNetworkClassifier(dim_features=X_train.shape[1],
                                          nb_attributes=N_ATTRIBUTES)

        print('Fitting Neural Network...')
        clf.fit(X_train, y_train)

        print('Predicting attributes...')
        y_pred = np.array(clf.predict(X_test))
        y_pred = y_pred.reshape((y_pred.shape[0], y_pred.shape[1])).T
        y_proba = y_pred

        print('Saving files...')
        np.savetxt('./DAP_' + predicate_type + '/prediction_NN', y_pred)
        if predicate_type == 'binary':
            np.savetxt('./DAP_' + predicate_type + '/probabilities_NN',
                       y_proba)
# DATA PREPROCESSING
path_tr = "../mentions_dumps/{}/mentions_tr.txt".format(lang)
path_tr_hr = "../mentions_dumps/{}/mentions_tr.txt".format(hr_lang)

if not os.path.exists("data/{}".format(lang)):
    os.mkdir("data/{}".format(lang))

if weight_hr == -1:  # Zero-shot
    data_info = "data/{}/info_ngram_lookup_ZS_hr={}.pkl".format(lang, hr_lang)
else:
    data_info = "data/{}/info_ngram_lookup_hr={}.pkl".format(lang, hr_lang)

if os.path.exists(data_info):
    mnt2ent_hr, mnt2ent_lr, ent2ind = _pickle.load(open(data_info, "rb"))
else:
    mnt2ent_hr, mnt2ent_lr, ent2ind = create_data(path_tr, path_tr_hr,
                                                  data_info, weight_hr)

# N-GRAMS TOKENIZER
if weight_hr == -1:  # Zero-shot
    path_tkn = "data/{}/ngram_lookup_ZS_hr={}_vocabulary.pkl".format(
        lang, hr_lang)
    path_cnt_hr = "data/{}/ngram_counter_ZS_{}.pkl".format(lang, hr_lang)
    path_cnt_lr = "data/{}/ngram_counter_ZS_{}.pkl".format(lang, lang)
else:
    path_tkn = "data/{}/ngram_lookup_hr={}_vocabulary.pkl".format(
        lang, hr_lang)
    path_cnt_hr = "data/{}/ngram_counter_{}.pkl".format(lang, hr_lang)
    path_cnt_lr = "data/{}/ngram_counter_{}.pkl".format(lang, lang)

data_tr = [mnt2ent_hr, mnt2ent_lr]
if os.path.exists(path_tkn):
def get_data(self):
    return utils.create_data()
import os
import argparse

from utils import create_data

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Preprocess corpus dataset')
    parser.add_argument('--folder_path', type=str, required=True,
                        help='required path to questions')
    parser.add_argument('--output', type=str, required=True,
                        help='data output filename')
    args = parser.parse_args()
    for data_type in ['training', 'test']:
        create_data(os.path.join(args.folder_path, 'questions/' + data_type),
                    data_type + '_' + args.output)
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
if __name__ == '__main__':

    # --------------------------------------------------------------------------
    # initialize the logger
    FMT_LOG = '%(asctime)s - %(name)s:%(funcName)s:%(lineno)s - ' \
              '%(levelname)s - %(message)s'
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter(FMT_LOG))
    LOGGER.setLevel(logging.DEBUG)
    LOGGER.addHandler(sh)

    # --------------------------------------------------------------------------
    # create a fake 2D dataset
    x, _, _, _ = create_data()  # only the first return value is relevant
    LOGGER.info('Created {} data'.format(x.shape))

    # let's split this data into three parts
    ndata, dim = x.shape[0], x.shape[1]
    a, b = ndata // 2, ndata // 2 + ndata // 4
    x1, x2, x3 = x[:a], x[a:b], x[b:]
    LOGGER.info('Split the data: {}, {}, and {}'.format(
        x1.shape, x2.shape, x3.shape))

    # --------------------------------------------------------------------------
    # here, we will create a dynim hdspace with approx metric
    # we will train this hdspace using some data and save (as a faiss index)
    # nlist and nprobes are the key parameters that define
import numpy as np
import scipy as sp
from sklearn.cluster import KMeans

import utils

if __name__ == '__main__':
    K = 4

    X_norm, z = utils.create_data()
    X_norm = np.concatenate((X_norm, X_norm + (0, 3.2)))
    z = np.concatenate((z, z + 2))

    A = utils.create_affinity_matrix(X_norm)
    Q = utils.create_constraint_matrix(z)

    D = np.diag(np.sum(A, axis=1))
    vol = np.sum(A)

    D_norm = np.linalg.inv(np.sqrt(D))
    L_norm = np.eye(*A.shape) - D_norm.dot(A.dot(D_norm))
    Q_norm = D_norm.dot(Q.dot(D_norm))

    # alpha < K-th eigenvalue of Q_norm
    alpha = 0.6 * sp.linalg.svdvals(Q_norm)[K]
    Q1 = Q_norm - alpha * np.eye(*Q_norm.shape)

    val, vec = sp.linalg.eig(L_norm, Q1)
    vec = vec[:, val >= 0]
    vec_norm = (vec / np.linalg.norm(vec, axis=0)) * np.sqrt(vol)
    costs = np.multiply(vec_norm.T.dot(L_norm), vec_norm.T).sum(axis=1)
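    # The snippet ends after computing the per-vector costs. What follows is
    # a hedged continuation (an assumption, not the original author's code):
    # keep the feasible generalized eigenvector with the smallest cost and
    # cluster its entries, which also uses the otherwise-unused KMeans import.
    best = vec_norm[:, np.argmin(costs)].real.reshape(-1, 1)
    labels = KMeans(n_clusters=K).fit_predict(best)
    print(labels)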