Example #1
import torch.nn as nn

# Params, load_embeddings and get_wordanalogy_scores_customfname are
# project-local helpers imported elsewhere in the original module.


def score_analogy(analogy_fname,
                  embeddings_path,
                  lang,
                  emb_dim,
                  max_vocab=200000,
                  lower=True,
                  cuda=True):

    # source embeddings
    params = Params()
    params.src_emb = embeddings_path
    params.tgt_emb = ''
    params.max_vocab = max_vocab
    params.emb_dim = emb_dim
    params.cuda = cuda
    params.src_lang = lang
    params.tgt_lang = ''

    src_dico, _src_emb = load_embeddings(params, source=True)
    word2id = src_dico.word2id
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)

    if params.cuda:
        src_emb.cuda()

    embeddings = src_emb.weight.data.cpu().numpy()

    return get_wordanalogy_scores_customfname(analogy_fname,
                                              lang,
                                              word2id,
                                              embeddings,
                                              lower=lower)
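
A minimal call sketch for the helper above; the analogy file, embedding path, and dimensionality are placeholder values rather than anything from the original repository.

# Hypothetical invocation; paths and dimensions are illustrative only.
analogy_scores = score_analogy(analogy_fname='data/questions-words.txt',
                               embeddings_path='data/wiki.en.vec',
                               lang='en',
                               emb_dim=300,
                               max_vocab=200000,
                               lower=True,
                               cuda=False)
print(analogy_scores)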
Example #2
    @classmethod
    def setUpClass(cls):
        cls.toy_pickle1 = os.path.join('tests', 'test_pickles', "toy1.pickle")
        cls.toy_pickle2 = os.path.join('tests', 'test_pickles', "toy2.pickle")
        cls.embeddings, cls.word2index = load_embeddings(cls.toy_pickle1)
        cls.pt_analogy_path = os.path.join('src', 'analogies',
                                           "questions-words-ptbr.txt")
        cls.list_of_names = ["toy1", "toy2"]
        cls.list_of_pickles = [cls.toy_pickle1, cls.toy_pickle2]
        cls.judge = ModelJudge(cls.list_of_names, cls.list_of_pickles,
                               cls.pt_analogy_path)
        cls.best_model = cls.judge.get_best()
Example #3
                    help="Maximum vocabulary size (-1 to disable)")
parser.add_argument("--emb_dim",
                    type=int,
                    default=300,
                    help="Embedding dimension")
parser.add_argument("--normalize_embeddings",
                    type=str,
                    default="",
                    help="Normalize embeddings before training")
parser.add_argument("--save_dico_path",
                    type=str,
                    default="./",
                    help="path to save trained dictionary")

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

src_dico, src_emb = load_embeddings(params, source=True, full_vocab=True)
tgt_dico, tgt_emb = load_embeddings(params, source=False, full_vocab=True)
params.src_dico = src_dico
params.tgt_dico = tgt_dico

# run dictionary generation
build_S2T_dictionary_and_saved(src_emb, tgt_emb, params)
build_T2S_dictionary_and_saved(src_emb, tgt_emb, params)
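
For quick experiments the same pipeline can also be driven without a command line by filling the parameter namespace directly; the snippet below is a sketch with placeholder paths and languages, and the exact set of fields required depends on the flags defined above the excerpt.

# Hypothetical programmatic setup; all values are placeholders.
from argparse import Namespace
params = Namespace(src_lang='en', tgt_lang='fr',
                   src_emb='wiki.en.vec', tgt_emb='wiki.fr.vec',
                   emb_dim=300, max_vocab=200000,
                   normalize_embeddings='', save_dico_path='./',
                   dico_eval='default', cuda=False)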
Example #4
import os
import sys

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE  # assumed; the original may define its own MSE
from sklearn.model_selection import cross_val_predict

# load_embeddings and print_array are project-local helpers imported
# elsewhere in the original script.
from src.network import load_centrality

if __name__ == '__main__':
    dataset_name = sys.argv[1]
    embedding_size = int(sys.argv[2])
    #methods = ['deepwalk', 'line', 'node2vec', 'struc2vec', 'SVD', 'eniws', 'eni_6_1', 'eni_6_2', 'eni_7_1', 'eni_8_1', 'eni_9_1']
    #methods = ['deepwalk', 'line', 'node2vec', 'struc2vec', 'SVD', 'eniws', 'eni_1']
    #methods = ['deepwalk', 'line', 'node2vec', 'struc2vec']+['eni_{}_{}_{}_{}'.format(lr, embedding_size, alpha, lamb) for lr in [0.001, 0.0025, 0.005] for alpha in [0.0, 0.01, 0.1, 1.0] for lamb in [0.0, 0.01, 0.1, 1.0]]

    methods = ['graphsage']
    centrality_types = ['degree', 'closeness', 'betweenness', 'eigenvector', 'kcore']
    #centrality_types = ['spread_number']
    centrality_path = 'result/{}/data'.format(dataset_name)
    save_path = 'result/{}'.format(dataset_name)
    embedding_filenames = [os.path.join(save_path, "baseline_{}".format(embedding_size), 
        "{}.embeddings".format(m)) for m in methods if not m.startswith('eni_')]+\
         [os.path.join(save_path, "{}".format(m), 'embeddings.npy') for m in methods if m.startswith('eni_')]

    embedding_filenames = [os.path.join(save_path, "baseline_{}".format(embedding_size), "{}.npy".format(m)) for m in methods]

    embeddings = [load_embeddings(name) for name in embedding_filenames]
    centralities = [load_centrality(centrality_path, c) for c in centrality_types]
    res = np.zeros((len(methods), len(centrality_types)))
    for i in range(len(methods)):
        for j in range(len(centrality_types)):
            lr = LinearRegression(n_jobs=-1)
            y_pred = cross_val_predict(lr, embeddings[i][centralities[j][:, 0].astype(int)], centralities[j][:, 1])
            res[i, j] = MSE(y_pred, centralities[j][:, 1])/np.mean(centralities[j][:, 1])
            #res[i, j] = np.mean(abs((y_pred-centralities[j][:, 1])/(centralities[j][:, 1]+1e-10)))
    print_array(res)
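
In place of the project-local print_array helper, the error matrix can also be shown with labelled rows and columns; pandas is an assumption here, not an import from the original script.

    # Labelled view of the normalized-MSE matrix (pandas assumed available).
    import pandas as pd
    print(pd.DataFrame(res, index=methods, columns=centrality_types))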
Example #5
            len(list(filter(lambda x: i < x, threshold))) - 1 for i in Y[:, 1]
        ])
    lb = preprocessing.LabelBinarizer()
    labels = lb.fit_transform(labels)

    save_path = 'result/{}'.format(dataset_name)
    embedding_filenames = [os.path.join(save_path, "baseline_{}".format(embedding_size),
        "{}.embeddings".format(m)) for m in methods if not m.startswith('eni_')]+\
        [os.path.join(save_path, "{}_{}".format(m, embedding_size), 'embeddings.npy') for m in methods if m.startswith('eni_')]
    embedding_filenames = [
        os.path.join(save_path, "baseline_{}".format(embedding_size),
                     "{}.npy".format(m)) for m in methods
    ]

    embeddings = [
        load_embeddings(name)[Y[:, 0].astype(int)]
        for name in embedding_filenames
    ]

    centrality_types = ['closeness', 'betweenness', 'eigenvector', 'kcore']
    centralities = [
        load_centrality(centrality_path, c)[Y[:, 0].astype(int),
                                            1].reshape(-1, 1)
        for c in centrality_types
    ]
    centralities = [c.reshape(-1, 1) for c in centralities]
    #res = np.zeros((len(methods), len(centrality_types)))
    combine_centrality = np.hstack(centralities)
    centralities.append(combine_centrality)
    acc = []
Example #6
    parser = argparse.ArgumentParser(description='Unsupervised training')
    parser.add_argument(
        "--src_emb",
        type=str,
        default='/home/mareike/PycharmProjects/breakit/embeddings/test_emb.vec'
    )
    parser.add_argument(
        "--tgt_emb",
        type=str,
        default='/home/mareike/PycharmProjects/breakit/embeddings/test_emb.vec'
    )
    parser.add_argument("--src_lang", type=str, default='en')
    parser.add_argument("--tgt_lang", type=str, default='en')
    parser.add_argument("--emb_dim", type=int, default=300)
    parser.add_argument("--max_vocab",
                        type=int,
                        default=200000,
                        help="Maximum vocabulary size (-1 to disable)")
    parser.add_argument("--cuda", type=bool, default=False)
    parser.add_argument("--normalize_embeddings", type=str, default="center")

    # parse parameters
    params = parser.parse_args()

    src_dico, src_emb = load_embeddings(params, source=True)
    trg_dico, trg_emb = load_embeddings(params, source=False)
    #src_emb = src_emb.numpy()
    #trg_emb = trg_emb.numpy()

    m = extract_initial_mapping(src_emb.numpy(), trg_emb.numpy())
Example #7
evaluator = CaptionEvaluator(val_metric=args.val_metric)
logger.info('----------')

# Read word map
word_map_file = os.path.join(args.data_dir,
                             'WORDMAP_' + args.data_name + '.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)

# Initialize / load checkpoint
if not args.checkpoint_path:
    embed_layer = load_embeddings(args.word_embed,
                                  args.word_embed_dim,
                                  word_map,
                                  skip_first_line=True,
                                  rand_range=[-0.05, 0.05],
                                  fine_tune_embeds=args.fine_tune_embeds,
                                  pad_index=C.PAD_INDEX)

    encoder = CNNEncoder(
        proj_dim=args.decoder_hidden_size if not args.use_attention else None,
        feat_layer="conv" if args.use_attention else "fc",
        fine_tune=args.fine_tune_encoder)

    if args.decoder_cell_type == "vanilla":
        decoder = RNNDecoder(vocab_size=len(word_map),
                             embed_size=args.word_embed_dim,
                             encoder_state_size=encoder.output_size,
                             hidden_size=args.decoder_hidden_size,
                             decoder_out_dropout_prob=args.decoder_dropout,
Example #8
    params = parser.parse_args()
    params.src_lang = 'src'
    params.tgt_lang = 'tgt'  # lang code can be arbitrary since we don't load dictionary files

    # check parameters
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)

    # load input (to translate)
    print("loading input data...", file=sys.stderr)
    input_sents = load_input(params.input,
                             params.input_lowercase)  # CHECK: vocab?

    # load embeddings
    print("loading embeddings...", file=sys.stderr)
    src_dico, _src_emb = load_embeddings(
        params, source=True)  # 'dico' = word2id mappings
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)

    tgt_dico, _tgt_emb = load_embeddings(params, source=False)
    tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True)
    tgt_emb.weight.data.copy_(_tgt_emb)

    if params.cuda:
        src_emb.cuda()
        tgt_emb.cuda()

    # normalize embeddings
    print("normalizing embeddings...", file=sys.stderr)
    params.src_mean = normalize_embeddings(src_emb.weight.data,
                                           params.normalize_embeddings)