Example #1
def predict_missing_links(train_file_path, evaluation_file_path, model_path,
                          tensorboard_visualizations_path):
    graph = load_from_csv('.', train_file_path, sep=',')
    evaluation_samples = load_from_csv('.', evaluation_file_path, sep=',')

    print('Head of the loaded graph: ')
    print(graph[:5])

    train_samples, test_samples = split(graph)
    print(
        f'Divided into train and test subsets with shapes {train_samples.shape} and {test_samples.shape} respectively.'
    )

    if not os.path.isfile(model_path):
        model = train_transe(train_samples)  # train_complex(train_samples)
        save_model(model, model_path)
    else:
        model = restore_model(model_path)

    metrics = compute_metrics(model, train_samples, test_samples)
    print(f'{"metric":10s}: {"score":5s}')
    for metric, score in metrics.items():
        print(f'{metric:10s}: {score:<5.2f}')

    scores, ranks = score_samples(model, evaluation_samples, train_samples)
    evaluation_summary = summarize(scores, evaluation_samples, ranks)

    print(evaluation_summary)

    if tensorboard_visualizations_path:
        os.makedirs(tensorboard_visualizations_path, exist_ok=True)
        create_tensorboard_visualizations(model,
                                          tensorboard_visualizations_path)
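The helpers split, train_transe, compute_metrics, score_samples and summarize are project functions that are not shown above. A minimal sketch of the first three, assuming the AmpliGraph 1.x API (the hyperparameters below are placeholders, not the author's values):

import numpy as np
from ampligraph.latent_features import TransE
from ampligraph.evaluation import (train_test_split_no_unseen, evaluate_performance,
                                   mr_score, mrr_score, hits_at_n_score)


def split(graph, test_size=1000):
    # Split without introducing entities that appear only in the test set
    return train_test_split_no_unseen(graph, test_size=test_size)


def train_transe(train_samples):
    model = TransE(batches_count=100, epochs=200, k=150, eta=5,
                   optimizer='adam', optimizer_params={'lr': 1e-3},
                   loss='multiclass_nll', verbose=True)
    model.fit(train_samples)
    return model


def compute_metrics(model, train_samples, test_samples):
    # Filter known triples so valid corruptions are not counted as errors
    filter_triples = np.concatenate((train_samples, test_samples))
    ranks = evaluate_performance(test_samples, model=model,
                                 filter_triples=filter_triples, verbose=True)
    return {'mr': mr_score(ranks), 'mrr': mrr_score(ranks),
            'hits@10': hits_at_n_score(ranks, n=10)}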
Example #2
def test_convkb_save_restore():

    model = ConvKB(batches_count=2,
                   seed=22,
                   epochs=1,
                   k=10,
                   eta=1,
                   embedding_model_params={
                       'num_filters': 16,
                       'filter_sizes': [1],
                       'dropout': 0.0,
                       'is_trainable': True
                   },
                   optimizer='adam',
                   optimizer_params={'lr': 0.001},
                   loss='pairwise',
                   loss_params={},
                   verbose=True)

    X = load_wn18()
    model.fit(X['train'])
    y1 = model.predict(X['test'][:10])

    save_model(model, 'convkb.tmp')
    del model
    model = restore_model('convkb.tmp')

    y2 = model.predict(X['test'][:10])

    assert np.all(y1 == y2)

    os.remove('convkb.tmp')
Example #3
def test_conve_evaluation_protocol():
    X = load_wn18()
    model = ConvE(batches_count=200,
                  seed=22,
                  epochs=1,
                  k=10,
                  embedding_model_params={
                      'conv_filters': 16,
                      'conv_kernel_size': 3
                  },
                  optimizer='adam',
                  optimizer_params={'lr': 0.01},
                  loss='bce',
                  loss_params={},
                  regularizer=None,
                  regularizer_params={
                      'p': 2,
                      'lambda': 1e-5
                  },
                  verbose=True,
                  low_memory=True)

    model.fit(X['train'])

    y1 = model.predict(X['test'][:5])

    save_model(model, 'model.tmp')
    del model
    model = restore_model('model.tmp')

    y2 = model.predict(X['test'][:5])

    assert np.all(y1 == y2)

    os.remove('model.tmp')
Example #4
def kge(triples, kge_name, epochs, batch_size, learning_rate, seed, verbose):
    kge_model_savepath = './temp/ampligraph.model'

    if not os.path.isfile(kge_model_savepath):
        #Embedding evaluation
        if verbose:
            # Train test split
            t_size = math.ceil(len(triples) * 0.2)
            X_train, X_test = train_test_split_no_unseen(triples,
                                                         test_size=t_size)

            eval_model = select_kge(kge_name, batch_size, epochs, seed,
                                    verbose)

            eval_model.fit(X_train)
            filter_triples = np.concatenate((X_train, X_test))
            ranks = evaluate_performance(X_test,
                                         model=eval_model,
                                         filter_triples=filter_triples,
                                         use_default_protocol=True,
                                         verbose=True)

            mrr = mrr_score(ranks)
            print("MRR: %.2f" % (mrr))
            mr = mr_score(ranks)
            print("MR: %.2f" % (mr))
            hits_10 = hits_at_n_score(ranks, n=10)
            print("Hits@10: %.2f" % (hits_10))
            hits_3 = hits_at_n_score(ranks, n=3)
            print("Hits@3: %.2f" % (hits_3))
            hits_1 = hits_at_n_score(ranks, n=1)
            print("Hits@1: %.2f" % (hits_1))

            print('''
            - Ampligraph example -
            MRR: 0.25
            MR: 4927.33
            Hits@10: 0.35
            Hits@3: 0.28
            Hits@1: 0.19
            ''')

        model = select_kge(kge_name, batch_size, epochs, seed, verbose)

        print('Training...')
        model.fit(np.array(triples))
        save_model(model, model_name_path=kge_model_savepath)
    else:
        model = restore_model(model_name_path=kge_model_savepath)

    return model
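select_kge is not defined in this snippet; a plausible sketch, assuming it simply maps a model name to an AmpliGraph 1.x class (the hyperparameters below are placeholders):

from ampligraph.latent_features import TransE, DistMult, ComplEx


def select_kge(kge_name, batch_size, epochs, seed, verbose):
    # Note: AmpliGraph's batches_count is the number of batches per epoch,
    # not the batch size, so reusing batch_size here is an assumption
    classes = {'TransE': TransE, 'DistMult': DistMult, 'ComplEx': ComplEx}
    return classes[kge_name](batches_count=batch_size, epochs=epochs, seed=seed,
                             k=100, eta=5, optimizer='adam',
                             optimizer_params={'lr': 1e-3},
                             loss='multiclass_nll', verbose=verbose)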
Example #5
def test_save_and_restore_model():
    models = ('ComplEx', 'TransE', 'DistMult')

    for model_name in models:
        module = importlib.import_module("ampligraph.latent_features.models")

        print('Doing save/restore testing for model class: ', model_name)

        class_ = getattr(module, model_name)

        model = class_(batches_count=2,
                       seed=555,
                       epochs=20,
                       k=10,
                       optimizer='adagrad',
                       optimizer_params={'lr': 0.1})

        X = np.array([['a', 'y', 'b'], ['b', 'y', 'a'], ['a', 'y', 'c'],
                      ['c', 'y', 'a'], ['a', 'y', 'd'], ['c', 'y', 'd'],
                      ['b', 'y', 'c'], ['f', 'y', 'e']])

        model.fit(X)

        example_name = 'helloworld.pkl'

        save_model(model, model_name_path=example_name)

        loaded_model = restore_model(model_name_path=example_name)

        assert loaded_model is not None
        assert loaded_model.all_params == model.all_params
        assert loaded_model.is_fitted == model.is_fitted
        assert loaded_model.ent_to_idx == model.ent_to_idx
        assert loaded_model.rel_to_idx == model.rel_to_idx

        for i in range(len(loaded_model.trained_model_params)):
            npt.assert_array_equal(loaded_model.trained_model_params[i],
                                   model.trained_model_params[i])

        y_pred_before, _ = model.predict(np.array([['f', 'y', 'e'],
                                                   ['b', 'y', 'd']]),
                                         get_ranks=True)
        y_pred_after, _ = loaded_model.predict(np.array([['f', 'y', 'e'],
                                                         ['b', 'y', 'd']]),
                                               get_ranks=True)
        npt.assert_array_equal(y_pred_after, y_pred_before)

        npt.assert_array_equal(
            loaded_model.get_embeddings(['a', 'b'], embedding_type='entity'),
            model.get_embeddings(['a', 'b'], embedding_type='entity'))

        os.remove(example_name)
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 projection_dim: int = 10,
                 model_path: str = "",
                 ignore_oov=True):
        super(NymEmbedder, self).__init__()

        self.vocab = vocab
        self._ignore_oov = ignore_oov
        self._oov_idx = 0
        self.output_dim = projection_dim
        self.model = restore_model(model_name_path=model_path)

        self.oov_pad_vec = np.full(self.get_output_dim(),
                                   1 / self.get_output_dim())
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 projection_dim: int = 10,
                 model_path: str = "",
                 ignore_oov=True):
        super(NymEmbedder, self).__init__()

        with timeit_context('initializing knowledge embedder'):
            self.vocab = vocab

            self._ignore_oov = ignore_oov
            oov_token = vocab._oov_token
            self._oov_idx = 0
            self.output_dim = projection_dim
            self.model = restore_model(model_name_path=model_path)

            self.oov_pad_vec = np.full(self.get_output_dim(),
                                       1 / self.get_output_dim())
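Both constructors call self.get_output_dim(), which is not shown here; assuming NymEmbedder follows the AllenNLP TokenEmbedder convention, it would simply report the projection dimension:

    def get_output_dim(self) -> int:
        # Width of the vectors produced by this embedder
        return self.output_dim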
Example #8
    def __init__(self):
        self.num_dimensions = 700
        self.df_merged = pickle_in('df_merged')

        self.song_hash = pickle_in('song_hash')
        self.y_pred = pickle_in('y_pred_user_8332014')
        self.unseen_predict = pickle_in('unseen_predict')
        #self.default_user_id = 'user_8332014'

        user_recommend_list = [
            (t_item[2], score)
            for t_item, score in zip(self.unseen_predict, self.y_pred)
        ]
        self.user_recommend_list = sorted(user_recommend_list,
                                          key=lambda KV: KV[1],
                                          reverse=True)
        self.model = restore_model(
            model_name_path='./model/complex_model_opt_lf.pkl')
        pass
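pickle_in here (and pickle_out in Example #11) are project helpers that are not shown; a minimal sketch, assuming one pickle file per object name (the ./pickles/ directory is an assumption):

import pickle


def pickle_in(name):
    # Load a previously pickled object by name
    with open('./pickles/{}.pkl'.format(name), 'rb') as f:
        return pickle.load(f)


def pickle_out(name, obj):
    # Persist an object under the given name
    with open('./pickles/{}.pkl'.format(name), 'wb') as f:
        pickle.dump(obj, f)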
Example #9
def test_conve_fit_predict_save_restore():

    X = np.array([['a', 'y', 'b'], ['b', 'y', 'a'], ['a', 'y', 'c'],
                  ['c', 'y', 'a'], ['a', 'y', 'd'], ['c', 'y', 'd'],
                  ['b', 'y', 'c'], ['f', 'y', 'e']])

    X_test = np.array([['f', 'y', 'a'], ['f', 'y', 'b']])

    model = ConvE(batches_count=1,
                  seed=22,
                  epochs=1,
                  k=10,
                  embedding_model_params={
                      'conv_filters': 16,
                      'conv_kernel_size': 3
                  },
                  optimizer='adam',
                  optimizer_params={'lr': 0.01},
                  loss='bce',
                  loss_params={},
                  regularizer=None,
                  regularizer_params={
                      'p': 2,
                      'lambda': 1e-5
                  },
                  verbose=True,
                  low_memory=True)

    model.fit(X)

    y1 = model.predict(X_test)
    print(y1)

    save_model(model, 'model.tmp')
    del model
    model = restore_model('model.tmp')

    y2 = model.predict(X_test)

    assert np.all(y1 == y2)
    os.remove('model.tmp')
Example #10
def test_convkb_save_restore():

    X = np.array([['a', 'y', 'b'], ['b', 'y', 'a'], ['a', 'y', 'c'],
                  ['c', 'y', 'a'], ['a', 'y', 'd'], ['c', 'y', 'd'],
                  ['b', 'y', 'c'], ['f', 'y', 'e']])

    X_test = np.array([['f', 'y', 'a'], ['f', 'y', 'b']])

    model = ConvKB(batches_count=1,
                   seed=22,
                   epochs=1,
                   k=10,
                   eta=1,
                   embedding_model_params={
                       'num_filters': 16,
                       'filter_sizes': [1],
                       'dropout': 0.0,
                       'is_trainable': True
                   },
                   optimizer='adam',
                   optimizer_params={'lr': 0.001},
                   loss='pairwise',
                   loss_params={},
                   verbose=True)

    model.fit(X)
    y1 = model.predict(X_test)

    save_model(model, 'convkb.tmp')
    del model
    model = restore_model('convkb.tmp')

    y2 = model.predict(X_test)

    assert np.all(y1 == y2)

    os.remove('convkb.tmp')
Example #11
def hash_multi_plane_matrix(P, v, num_planes):
    sides_matrix = side_of_plane_matrix(P, v) # Get the side of planes for P and v
    hash_value = 0
    for i in range(num_planes):
        sign = sides_matrix[i].item() # Get the value inside the matrix cell
        hash_i = 1 if sign >= 0 else 0
        hash_value += 2**i * hash_i # sum 2^i * hash_i
    return hash_value


if __name__ == '__main__':

    np.random.seed(0)
    num_dimensions = 700
    num_planes = 25
    total_song_list = pickle_in('total_song_list')
    model = restore_model(model_name_path='./model/complex_model_opt_lf.pkl')

    random_planes_matrix = np.random.normal(size=(num_planes, num_dimensions))

    song_hash = {}
    for song in total_song_list:
        v = model.get_embeddings(song)
        hash_value = hash_multi_plane_matrix(random_planes_matrix, v, num_planes)
        if hash_value in song_hash:
            song_hash[hash_value].append(song)
        else:
            song_hash[hash_value] = [song]

    pickle_out('song_hash', song_hash)
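side_of_plane_matrix is used above but not defined; a minimal sketch of the usual locality-sensitive-hashing helper, assuming P is a (num_planes, num_dimensions) matrix and v a single embedding vector:

def side_of_plane_matrix(P, v):
    # Dot product of each plane's normal vector with v; the sign of each entry
    # tells on which side of that hyperplane the vector falls
    return np.dot(P, v.T)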
Example #12
import numpy as np
import pandas as pd
from ampligraph.utils import restore_model
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

df = pd.read_pickle(
    "../Data/reducedDataset001.pkl")  # Load the cleaned, reduced dataset
df["train"] = df.Sujeto > "S14"  # Select the training data
model = restore_model(
    "../Data/KGEmbedModel007.pkl")  # Load the embedding model
df["data_id"] = df.index.values.astype(str)
df["data_id"] = "Dato" + df.data_id  # New field used to look up the model's entities
df["subject_id"] = df.Sujeto.values.astype(str)

data = (df.data_id).unique()
data_embeddings = dict(
    zip(data, model.get_embeddings(data))
)  # Dictionary keyed by data_id, valued by the embedding retrieved from the model


# Function to obtain two data arrays: the features and the values to predict
def get_features_target(mask):
    def get_embeddings(dato):
        return data_embeddings.get(
            dato, np.full(250, np.nan)
        )  # Returns an array of size 2*k, where k is the dimensionality set in the embedding model

    X = np.vstack(df[mask].data_id.apply(get_embeddings).values)
Example #13
import csv
import pandas as pd
from ampligraph.utils import restore_model
# Read the datasets and models
df = pd.read_pickle("../Data/reducedDataset001.pkl")
df["train"] = df.Sujeto > "S14"
model = restore_model("../Data/KGEmbedModel004.pkl")
df["data_id"] = df.index.values.astype(str)
df["data_id"] = "Dato" + df.data_id
df["subject_id"] = df.Sujeto.values.astype(str)
# Build the dictionary to save
data = (df.data_id).unique()
data_embeddings = dict(zip(data, model.get_embeddings(data)))
#encodedNumpyData = json.dumps(data_embeddings, cls=NumpyArrayEncoder)
file_path = "../Data/EmbeddingsDict.csv"
#json.dump( encodedNumpyData, open(file_path, 'w'))
#json.dump(data_embeddings, codecs.open(file_path, 'w', encoding='utf-8'), separators=(',', ':'), sort_keys=True, indent=4)
#f = open(file_path,"wb")
#pickle.dump(data_embeddings,f)
#f.close()
#w = csv.writer(open(file_path, "w"))
#for key, val in data_embeddings.items():
#    w.writerow([key, val])
with open(file_path, "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(data_embeddings.keys())
    writer.writerows(zip(*data_embeddings.values()))
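The CSV written above has one column per data_id and one row per embedding component; reading it back into the same dictionary shape could look like this (a sketch, not part of the original script):

import pandas as pd

emb_df = pd.read_csv(file_path)
data_embeddings_restored = {col: emb_df[col].to_numpy() for col in emb_df.columns}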
Example #14
def test_restore_model_errors():

    with pytest.raises(FileNotFoundError):
        model = restore_model(model_name_path='filenotfound.model')
Example #15
    save_model(model2, model_name_path=ke_model_path + '2')

    #filter_triples = np.concatenate((X_train, X_valid))
    #filter = np.concatenate((X['train'], X['valid'], X['test']))
    #ranks = evaluate_performance(X['test'],
    #                             model=model,
    #                             filter_triples=filter,
    #                             use_default_protocol=True,  # corrupt subj and obj separately while evaluating
    #                             verbose=True)

    #mrr = mrr_score(ranks)
    #hits_10 = hits_at_n_score(ranks, n=10)
    #print("MRR: %f, Hits@10: %f" % (mrr, hits_10))
    # Output: MRR: 0.886406, Hits@10: 0.935000
else:
    model = restore_model(model_name_path=ke_model_path)
    model2 = restore_model(model_name_path=ke_model_path + '2')

    import pickle

    with open(ke_wnkeys_path, 'rb') as handle:
        tok2id, id2tok = pickle.load(handle)

import pprint


def find_in_tok2id(w):
    for s in tok2id.keys():
        if w in s:
            print(w, s, "it is alphabetically there")
Example #16
                    loss='multiclass_nll',
                    regularizer='LP',
                    regularizer_params={
                        'p': 3,
                        'lambda': 1e-5
                    },
                    seed=0,
                    verbose=True)

    print("Training...")
    model.fit(X_train)
    save_model(model, model_name_path=ke_model_path)

    filter_triples = np.concatenate((X_train, X_valid))
else:
    model = restore_model(model_name_path=ke_model_path)

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
from incf.countryutils import transformations

print("Extracting Embeddings..")

id_to_name_map = {
    **dict(zip(df.home_team_id, df.home_team)),
    **dict(zip(df.away_team_id, df.away_team))
}

teams = pd.concat(