def predict_missing_links(train_file_path, evaluation_file_path, model_path,
                          tensorboard_visualizations_path):
    graph = load_from_csv('.', train_file_path, sep=',')
    evaluation_samples = load_from_csv('.', evaluation_file_path, sep=',')

    print('Head of the loaded graph:')
    print(graph[:5])

    train_samples, test_samples = split(graph)
    print(f'Divided into train and test subsets with shapes '
          f'{train_samples.shape} and {test_samples.shape} respectively.')

    if not os.path.isfile(model_path):
        model = train_transe(train_samples)  # train_complex(train_samples)
        save_model(model, model_path)
    else:
        model = restore_model(model_path)

    metrics = compute_metrics(model, train_samples, test_samples)
    print(f'{"metric":10s}: {"score":5s}')
    for metric, score in metrics.items():
        print(f'{metric:10s}: {score:<5.2f}')

    scores, ranks = score_samples(model, evaluation_samples, train_samples)
    evaluation_summary = summarize(scores, evaluation_samples, ranks)
    print(evaluation_summary)

    if tensorboard_visualizations_path:
        os.makedirs(tensorboard_visualizations_path, exist_ok=True)
        create_tensorboard_visualizations(model, tensorboard_visualizations_path)
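A minimal sketch of how this pipeline might be invoked. The file paths and output directory are placeholders, not from the source; the helpers it calls (train_transe, compute_metrics, score_samples, and so on) are assumed to be defined alongside predict_missing_links as above.

# Hypothetical invocation -- all paths below are placeholders.
if __name__ == '__main__':
    predict_missing_links(
        train_file_path='train.csv',            # subject,predicate,object rows
        evaluation_file_path='evaluation.csv',  # triples to score with the trained model
        model_path='transe.pkl',                # reused on later runs via restore_model
        tensorboard_visualizations_path='./tb'  # pass '' to skip the projector export
    )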
def test_convkb_save_restore():
    model = ConvKB(batches_count=2, seed=22, epochs=1, k=10, eta=1,
                   embedding_model_params={'num_filters': 16,
                                           'filter_sizes': [1],
                                           'dropout': 0.0,
                                           'is_trainable': True},
                   optimizer='adam', optimizer_params={'lr': 0.001},
                   loss='pairwise', loss_params={}, verbose=True)
    X = load_wn18()
    model.fit(X['train'])
    y1 = model.predict(X['test'][:10])

    save_model(model, 'convkb.tmp')
    del model
    model = restore_model('convkb.tmp')

    y2 = model.predict(X['test'][:10])
    assert np.all(y1 == y2)

    os.remove('convkb.tmp')
def test_conve_evaluation_protocol():
    X = load_wn18()
    model = ConvE(batches_count=200, seed=22, epochs=1, k=10,
                  embedding_model_params={'conv_filters': 16,
                                          'conv_kernel_size': 3},
                  optimizer='adam', optimizer_params={'lr': 0.01},
                  loss='bce', loss_params={},
                  regularizer=None,
                  regularizer_params={'p': 2, 'lambda': 1e-5},
                  verbose=True, low_memory=True)
    model.fit(X['train'])
    y1 = model.predict(X['test'][:5])

    save_model(model, 'model.tmp')
    del model
    model = restore_model('model.tmp')

    y2 = model.predict(X['test'][:5])
    assert np.all(y1 == y2)

    os.remove('model.tmp')
def kge(triples, kge_name, epochs, batch_size, learning_rate, seed, verbose):
    # Note: the original assigned kge_name = parsed_args.kge here, shadowing
    # the parameter with an undefined global; that line is dropped.
    kge_model_savepath = './temp/ampligraph.model'

    if not os.path.isfile(kge_model_savepath):
        if verbose:
            # Embedding evaluation on a held-out split before the final fit.
            t_size = math.ceil(len(triples) * 0.2)
            X_train, X_test = train_test_split_no_unseen(triples, test_size=t_size)

            eval_model = select_kge(kge_name, batch_size, epochs, seed, verbose)
            eval_model.fit(X_train)

            filter_triples = np.concatenate((X_train, X_test))
            ranks = evaluate_performance(X_test,
                                         model=eval_model,
                                         filter_triples=filter_triples,
                                         use_default_protocol=True,
                                         verbose=True)

            mrr = mrr_score(ranks)
            print("MRR: %.2f" % (mrr))
            mr = mr_score(ranks)
            print("MR: %.2f" % (mr))
            hits_10 = hits_at_n_score(ranks, n=10)
            print("Hits@10: %.2f" % (hits_10))
            hits_3 = hits_at_n_score(ranks, n=3)
            print("Hits@3: %.2f" % (hits_3))
            hits_1 = hits_at_n_score(ranks, n=1)
            print("Hits@1: %.2f" % (hits_1))
            print('''
            - Ampligraph example -
            MRR: 0.25
            MR: 4927.33
            Hits@10: 0.35
            Hits@3: 0.28
            Hits@1: 0.19
            ''')

        model = select_kge(kge_name, batch_size, epochs, seed, verbose)
        print('Training...')
        model.fit(np.array(triples))
        save_model(model, model_name_path=kge_model_savepath)
    else:
        model = restore_model(model_name_path=kge_model_savepath)
    return model
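select_kge is called above but never defined in the snippet. A minimal hypothetical sketch of what it might look like, mapping a name to an AmpliGraph 1.x model class; the class set and the use of batch_size as AmpliGraph's batches_count are assumptions, not from the source.

# Hypothetical helper -- the real select_kge is not shown in the source.
from ampligraph.latent_features import ComplEx, DistMult, TransE

def select_kge(kge_name, batch_size, epochs, seed, verbose):
    kge_classes = {'TransE': TransE, 'ComplEx': ComplEx, 'DistMult': DistMult}
    kge_class = kge_classes[kge_name]
    # batch_size is passed through as batches_count (number of batches),
    # which is how AmpliGraph 1.x parameterizes batching.
    return kge_class(batches_count=batch_size, epochs=epochs,
                     seed=seed, verbose=verbose)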
def test_save_and_restore_model():
    models = ('ComplEx', 'TransE', 'DistMult')

    for model_name in models:
        module = importlib.import_module("ampligraph.latent_features.models")
        print('Doing save/restore testing for model class: ', model_name)
        class_ = getattr(module, model_name)
        model = class_(batches_count=2, seed=555, epochs=20, k=10,
                       optimizer='adagrad', optimizer_params={'lr': 0.1})

        X = np.array([['a', 'y', 'b'],
                      ['b', 'y', 'a'],
                      ['a', 'y', 'c'],
                      ['c', 'y', 'a'],
                      ['a', 'y', 'd'],
                      ['c', 'y', 'd'],
                      ['b', 'y', 'c'],
                      ['f', 'y', 'e']])
        model.fit(X)

        example_name = 'helloworld.pkl'
        save_model(model, model_name_path=example_name)
        loaded_model = restore_model(model_name_path=example_name)

        assert loaded_model is not None
        assert loaded_model.all_params == model.all_params
        assert loaded_model.is_fitted == model.is_fitted
        assert loaded_model.ent_to_idx == model.ent_to_idx
        assert loaded_model.rel_to_idx == model.rel_to_idx

        for i in range(len(loaded_model.trained_model_params)):
            npt.assert_array_equal(loaded_model.trained_model_params[i],
                                   model.trained_model_params[i])

        y_pred_before, _ = model.predict(np.array([['f', 'y', 'e'], ['b', 'y', 'd']]),
                                         get_ranks=True)
        y_pred_after, _ = loaded_model.predict(np.array([['f', 'y', 'e'], ['b', 'y', 'd']]),
                                               get_ranks=True)
        npt.assert_array_equal(y_pred_after, y_pred_before)

        npt.assert_array_equal(
            loaded_model.get_embeddings(['a', 'b'], embedding_type='entity'),
            model.get_embeddings(['a', 'b'], embedding_type='entity'))

        os.remove(example_name)
def __init__(self,
             vocab: Vocabulary,
             projection_dim: int = 10,
             model_path: str = "",
             ignore_oov=True):
    super(NymEmbedder, self).__init__()
    self.vocab = vocab
    self._ignore_oov = ignore_oov
    self._oov_idx = 0
    self.output_dim = projection_dim
    self.model = restore_model(model_name_path=model_path)
    # Uniform vector used to pad out-of-vocabulary tokens.
    self.oov_pad_vec = np.full(self.get_output_dim(), 1 / self.get_output_dim())
def __init__(self,
             vocab: Vocabulary,
             projection_dim: int = 10,
             model_path: str = "",
             ignore_oov=True):
    super(NymEmbedder, self).__init__()
    with timeit_context('initializing knowledge embedder'):
        self.vocab = vocab
        self._ignore_oov = ignore_oov
        oov_token = vocab._oov_token
        self._oov_idx = 0
        self.output_dim = projection_dim
        self.model = restore_model(model_name_path=model_path)
        self.oov_pad_vec = np.full(self.get_output_dim(), 1 / self.get_output_dim())
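These constructors only load the model; they do not show how lookups use the OOV pad vector. A hypothetical method sketch (not from the source) of how the restored AmpliGraph model and oov_pad_vec would plausibly combine:

# Hypothetical helper, not part of the original NymEmbedder: return the
# trained embedding for entities the KGE model knows (ent_to_idx is
# AmpliGraph's entity->index map), and the uniform OOV vector otherwise.
def embed_token(self, token: str) -> np.ndarray:
    if token in self.model.ent_to_idx:
        return self.model.get_embeddings([token], embedding_type='entity')[0]
    return self.oov_pad_vec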
def __init__(self):
    self.num_dimensions = 700
    self.df_merged = pickle_in('df_merged')
    self.song_hash = pickle_in('song_hash')
    self.y_pred = pickle_in('y_pred_user_8332014')
    self.unseen_predict = pickle_in('unseen_predict')
    # self.default_user_id = 'user_8332014'

    # Pair each candidate triple's object (the song) with its predicted
    # score, then sort by score, highest first.
    user_recommend_list = [(t_item[2], score)
                           for t_item, score in zip(self.unseen_predict, self.y_pred)]
    self.user_recommend_list = sorted(user_recommend_list,
                                      key=lambda kv: kv[1],
                                      reverse=True)
    self.model = restore_model(model_name_path='./model/complex_model_opt_lf.pkl')
def test_conve_fit_predict_save_restore():
    X = np.array([['a', 'y', 'b'],
                  ['b', 'y', 'a'],
                  ['a', 'y', 'c'],
                  ['c', 'y', 'a'],
                  ['a', 'y', 'd'],
                  ['c', 'y', 'd'],
                  ['b', 'y', 'c'],
                  ['f', 'y', 'e']])
    X_test = np.array([['f', 'y', 'a'], ['f', 'y', 'b']])

    model = ConvE(batches_count=1, seed=22, epochs=1, k=10,
                  embedding_model_params={'conv_filters': 16,
                                          'conv_kernel_size': 3},
                  optimizer='adam', optimizer_params={'lr': 0.01},
                  loss='bce', loss_params={},
                  regularizer=None,
                  regularizer_params={'p': 2, 'lambda': 1e-5},
                  verbose=True, low_memory=True)

    model.fit(X)
    y1 = model.predict(X_test)
    print(y1)

    save_model(model, 'model.tmp')
    del model
    model = restore_model('model.tmp')

    y2 = model.predict(X_test)
    assert np.all(y1 == y2)

    os.remove('model.tmp')
def test_convkb_save_restore():
    X = np.array([['a', 'y', 'b'],
                  ['b', 'y', 'a'],
                  ['a', 'y', 'c'],
                  ['c', 'y', 'a'],
                  ['a', 'y', 'd'],
                  ['c', 'y', 'd'],
                  ['b', 'y', 'c'],
                  ['f', 'y', 'e']])
    X_test = np.array([['f', 'y', 'a'], ['f', 'y', 'b']])

    model = ConvKB(batches_count=1, seed=22, epochs=1, k=10, eta=1,
                   embedding_model_params={'num_filters': 16,
                                           'filter_sizes': [1],
                                           'dropout': 0.0,
                                           'is_trainable': True},
                   optimizer='adam', optimizer_params={'lr': 0.001},
                   loss='pairwise', loss_params={}, verbose=True)

    model.fit(X)
    y1 = model.predict(X_test)

    save_model(model, 'convkb.tmp')
    del model
    model = restore_model('convkb.tmp')

    y2 = model.predict(X_test)
    assert np.all(y1 == y2)

    os.remove('convkb.tmp')
def hash_multi_plane_matrix(P, v, num_planes):
    sides_matrix = side_of_plane_matrix(P, v)  # Which side of each plane v falls on
    hash_value = 0
    for i in range(num_planes):
        sign = sides_matrix[i].item()  # Scalar value inside the matrix cell
        hash_i = 1 if sign >= 0 else 0
        hash_value += 2**i * hash_i  # sum 2^i * hash_i
    return hash_value


if __name__ == '__main__':
    np.random.seed(0)
    num_dimensions = 700
    num_planes = 25
    total_song_list = pickle_in('total_song_list')
    # AmpliGraph's restore_model takes model_name_path, not model_path.
    model = restore_model(model_name_path='./model/complex_model_opt_lf.pkl')
    random_planes_matrix = np.random.normal(size=(num_planes, num_dimensions))

    song_hash = {}
    for song in total_song_list:
        v = model.get_embeddings(song)
        hash_value = hash_multi_plane_matrix(random_planes_matrix, v, num_planes)
        if hash_value in song_hash:
            song_hash[hash_value].append(song)
        else:
            song_hash[hash_value] = [song]

    pickle_out('song_hash', song_hash)
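Once song_hash is built, candidate retrieval is a single bucket lookup: hash the query song's embedding with the same plane matrix and read off its bucket. A minimal hypothetical sketch (not from the source), assuming the model, planes, and song_hash built above:

# Hypothetical lookup: songs sharing a bucket with the query are its
# approximate nearest neighbours under this locality-sensitive hash.
def similar_songs(query_song, model, planes, num_planes, song_hash):
    v = model.get_embeddings(query_song)
    bucket = hash_multi_plane_matrix(planes, v, num_planes)
    return [s for s in song_hash.get(bucket, []) if s != query_song]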
import numpy as np
import pandas as pd
from ampligraph.utils import restore_model
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

# Load the cleaned, reduced dataset.
df = pd.read_pickle("../Data/reducedDataset001.pkl")
df["train"] = df.Sujeto > "S14"  # Select the training rows

# Restore the trained embedding model.
model = restore_model("../Data/KGEmbedModel007.pkl")

df["data_id"] = df.index.values.astype(str)
df["data_id"] = "Dato" + df.data_id  # New field used to look entities up in the model
df["subject_id"] = df.Sujeto.values.astype(str)

data = (df.data_id).unique()
# Dictionary keyed by data_id, valued by the entity embedding from the model.
data_embeddings = dict(zip(data, model.get_embeddings(data)))


# Build two arrays: features and the target values to predict.
def get_features_target(mask):

    def get_embeddings(dato):
        # Returns an array of size 2*k, k being the dimensionality
        # set in the embedding model.
        return data_embeddings.get(dato, np.full(250, np.nan))

    X = np.vstack(df[mask].data_id.apply(get_embeddings).values)
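The snippet breaks off inside get_features_target. A hedged sketch of how it might continue, consistent with the imports above; the label column name (Clase) and the classifier settings are assumptions, not from the source:

# Hypothetical continuation -- 'Clase' as the label column is an assumption.
def get_features_target(mask):

    def get_embeddings(dato):
        return data_embeddings.get(dato, np.full(250, np.nan))

    X = np.vstack(df[mask].data_id.apply(get_embeddings).values)
    y = df[mask].Clase.values  # assumed label column
    return X, y

X_train, y_train = get_features_target(df["train"])
X_test, y_test = get_features_target(~df["train"])

clf = XGBClassifier()  # default hyperparameters; the source's are unknown
clf.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, clf.predict(X_test)))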
import csv
import pandas as pd
from ampligraph.utils import restore_model

# Read datasets and models.
df = pd.read_pickle("../Data/reducedDataset001.pkl")
df["train"] = df.Sujeto > "S14"
model = restore_model("../Data/KGEmbedModel004.pkl")

df["data_id"] = df.index.values.astype(str)
df["data_id"] = "Dato" + df.data_id
df["subject_id"] = df.Sujeto.values.astype(str)

# Build the dictionary to save.
data = (df.data_id).unique()
data_embeddings = dict(zip(data, model.get_embeddings(data)))

file_path = "../Data/EmbeddingsDict.csv"

# Earlier serialization attempts, kept for reference:
# encodedNumpyData = json.dumps(data_embeddings, cls=NumpyArrayEncoder)
# json.dump(encodedNumpyData, open(file_path, 'w'))
# json.dump(data_embeddings, codecs.open(file_path, 'w', encoding='utf-8'),
#           separators=(',', ':'), sort_keys=True, indent=4)
# f = open(file_path, "wb")
# pickle.dump(data_embeddings, f)
# f.close()
# w = csv.writer(open(file_path, "w"))
# for key, val in data_embeddings.items():
#     w.writerow([key, val])

# Write one column per entity: a header row of data_ids, then the
# embedding components row by row.
with open(file_path, "w") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(data_embeddings.keys())
    writer.writerows(zip(*data_embeddings.values()))
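For completeness, a hedged sketch of reading that column-per-entity CSV back into a dict of numpy arrays; this round-trip code is not in the source:

# Hypothetical round-trip: each CSV column holds one entity's embedding,
# so reading with pandas and converting per column rebuilds the dict.
import pandas as pd

loaded = pd.read_csv("../Data/EmbeddingsDict.csv")
data_embeddings = {col: loaded[col].to_numpy() for col in loaded.columns}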
def test_restore_model_errors():
    with pytest.raises(FileNotFoundError):
        model = restore_model(model_name_path='filenotfound.model')
    save_model(model2, model_name_path=ke_model_path + '2')

    # filter_triples = np.concatenate((X_train, X_valid))
    # filter = np.concatenate((X['train'], X['valid'], X['test']))
    # ranks = evaluate_performance(X['test'],
    #                              model=model,
    #                              filter_triples=filter,
    #                              use_default_protocol=True,  # corrupt subj and obj separately while evaluating
    #                              verbose=True)
    # mrr = mrr_score(ranks)
    # hits_10 = hits_at_n_score(ranks, n=10)
    # print("MRR: %f, Hits@10: %f" % (mrr, hits_10))
    # Output: MRR: 0.886406, Hits@10: 0.935000
else:
    model = restore_model(model_name_path=ke_model_path)
    model2 = restore_model(model_name_path=ke_model_path + '2')

import pickle
with open(ke_wnkeys_path, 'rb') as handle:
    tok2id, id2tok = pickle.load(handle)

import pprint

def find_in_tok2id(w):
    for s in tok2id.keys():
        if w in s:
            print(w, s, "it is alphabetically there")
                  loss='multiclass_nll',
                  regularizer='LP',
                  regularizer_params={'p': 3, 'lambda': 1e-5},
                  seed=0,
                  verbose=True)

    print("Training...")
    model.fit(X_train)
    save_model(model, model_name_path=ke_model_path)
    filter_triples = np.concatenate((X_train, X_valid))
else:
    model = restore_model(model_name_path=ke_model_path)

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
from incf.countryutils import transformations

print("Extracting Embeddings..")
id_to_name_map = {**dict(zip(df.home_team_id, df.home_team)),
                  **dict(zip(df.away_team_id, df.away_team))}

teams = pd.concat(
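The snippet ends mid-expression, but the imports signal where it is headed: project the team embeddings to 2D with PCA and scatter-plot them with de-overlapped labels. A minimal hypothetical sketch of that step, assuming teams ends up as an array of unique team-id entities gathered from df; variable names beyond those above are illustrative:

# Hypothetical continuation: 2-D PCA projection of the team embeddings.
team_embeddings = model.get_embeddings(teams, embedding_type='entity')
embeddings_2d = PCA(n_components=2).fit_transform(team_embeddings)

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
texts = [ax.text(x, y, id_to_name_map[t])
         for (x, y), t in zip(embeddings_2d, teams)]
adjust_text(texts)  # nudge labels apart for readability
plt.show()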