def score_analogy(analogy_fname, embeddings_path, lang, emb_dim,
                  max_vocab=200000, lower=True, cuda=True):
    # source embeddings
    params = Params()
    params.src_emb = embeddings_path
    params.tgt_emb = ''
    params.max_vocab = max_vocab
    params.emb_dim = emb_dim
    params.cuda = cuda
    params.src_lang = lang
    params.tgt_lang = ''
    src_dico, _src_emb = load_embeddings(params, source=True)
    word2id = src_dico.word2id
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)
    if params.cuda:
        src_emb.cuda()
    embeddings = src_emb.weight.data.cpu().numpy()
    return get_wordanalogy_scores_customfname(analogy_fname, lang, word2id,
                                              embeddings, lower=lower)
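# A minimal usage sketch for score_analogy above; the analogy file, embedding
# path, and language code are hypothetical placeholders, not assets shipped
# with this repository.
if __name__ == '__main__':
    scores = score_analogy(
        analogy_fname='questions-words.txt',  # hypothetical analogy dataset
        embeddings_path='wiki.en.vec',        # hypothetical fastText-style vectors
        lang='en',
        emb_dim=300,
        cuda=False)
    print(scores)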
def setUpClass(cls):
    cls.toy_pickle1 = os.path.join('tests', 'test_pickles', "toy1.pickle")
    cls.toy_pickle2 = os.path.join('tests', 'test_pickles', "toy2.pickle")
    cls.embeddings, cls.word2index = load_embeddings(cls.toy_pickle1)
    cls.pt_analogy_path = os.path.join('src', 'analogies', "questions-words-ptbr.txt")
    cls.list_of_names = ["toy1", "toy2"]
    cls.list_of_pickles = [cls.toy_pickle1, cls.toy_pickle2]
    cls.judge = ModelJudge(cls.list_of_names, cls.list_of_pickles, cls.pt_analogy_path)
    cls.best_model = cls.judge.get_best()
help="Maximum vocabulary size (-1 to disable)") parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension") parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training") parser.add_argument("--save_dico_path", type=str, default="./", help="path to save trained dictionary") # parse parameters params = parser.parse_args() # check parameters assert params.src_lang, "source language undefined" assert os.path.isfile(params.src_emb) assert not params.tgt_lang or os.path.isfile(params.tgt_emb) assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval) src_dico, src_emb = load_embeddings(params, source=True, full_vocab=True) tgt_dico, tgt_emb = load_embeddings(params, source=False, full_vocab=True) params.src_dico = src_dico params.tgt_dico = tgt_dico # run dictioanry generation build_S2T_dictionary_and_saved(src_emb, tgt_emb, params) build_T2S_dictionary_and_saved(src_emb, tgt_emb, params)
from src.network import load_centrality

if __name__ == '__main__':
    dataset_name = sys.argv[1]
    embedding_size = int(sys.argv[2])
    #methods = ['deepwalk', 'line', 'node2vec', 'struc2vec', 'SVD', 'eniws', 'eni_6_1', 'eni_6_2', 'eni_7_1', 'eni_8_1', 'eni_9_1']
    #methods = ['deepwalk', 'line', 'node2vec', 'struc2vec', 'SVD', 'eniws', 'eni_1']
    #methods = ['deepwalk', 'line', 'node2vec', 'struc2vec']+['eni_{}_{}_{}_{}'.format(lr, embedding_size, alpha, lamb) for lr in [0.001, 0.0025, 0.005] for alpha in [0.0, 0.01, 0.1, 1.0] for lamb in [0.0, 0.01, 0.1, 1.0]]
    methods = ['graphsage']
    centrality_types = ['degree', 'closeness', 'betweenness', 'eigenvector', 'kcore']
    #centrality_types = ['spread_number']
    centrality_path = 'result/{}/data'.format(dataset_name)
    save_path = 'result/{}'.format(dataset_name)
    #embedding_filenames = [os.path.join(save_path, "baseline_{}".format(embedding_size), "{}.embeddings".format(m)) for m in methods if not m.startswith('eni_')]+\
    #    [os.path.join(save_path, "{}".format(m), 'embeddings.npy') for m in methods if m.startswith('eni_')]
    embedding_filenames = [os.path.join(save_path, "baseline_{}".format(embedding_size), "{}.npy".format(m))
                           for m in methods]
    embeddings = [load_embeddings(name) for name in embedding_filenames]
    centralities = [load_centrality(centrality_path, c) for c in centrality_types]

    # regress each centrality type from each embedding with cross-validated predictions
    res = np.zeros((len(methods), len(centrality_types)))
    for i in range(len(methods)):
        for j in range(len(centrality_types)):
            lr = LinearRegression(n_jobs=-1)
            y_pred = cross_val_predict(lr, embeddings[i][centralities[j][:, 0].astype(int)],
                                       centralities[j][:, 1])
            res[i, j] = MSE(y_pred, centralities[j][:, 1]) / np.mean(centralities[j][:, 1])
            #res[i, j] = np.mean(abs((y_pred-centralities[j][:, 1])/(centralities[j][:, 1]+1e-10)))
    print_array(res)
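# Hedged sketch of the score computed in the loop above, assuming MSE refers to
# sklearn.metrics.mean_squared_error: the cross-validated prediction error is
# divided by the mean of the true centrality values so that scores are
# comparable across centrality types with different scales.
import numpy as np
from sklearn.metrics import mean_squared_error

def normalized_mse(y_pred, y_true):
    """Mean squared error normalized by the mean target value (cf. res[i, j])."""
    return mean_squared_error(y_true, y_pred) / np.mean(y_true)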
    len(list(filter(lambda x: i < x, threshold))) - 1 for i in Y[:, 1]
])
lb = preprocessing.LabelBinarizer()
labels = lb.fit_transform(labels)
save_path = 'result/{}'.format(dataset_name)
#embedding_filenames = [os.path.join(save_path, "baseline_{}".format(embedding_size), "{}.embeddings".format(m)) for m in methods if not m.startswith('eni_')]+\
#    [os.path.join(save_path, "{}_{}".format(m, embedding_size), 'embeddings.npy') for m in methods if m.startswith('eni_')]
embedding_filenames = [
    os.path.join(save_path, "baseline_{}".format(embedding_size), "{}.npy".format(m))
    for m in methods
]
embeddings = [
    load_embeddings(name)[Y[:, 0].astype(int)] for name in embedding_filenames
]
centrality_types = ['closeness', 'betweenness', 'eigenvector', 'kcore']
centralities = [
    load_centrality(centrality_path, c)[Y[:, 0].astype(int), 1].reshape(-1, 1)
    for c in centrality_types
]
#res = np.zeros((len(methods), len(centrality_types)))
combine_centrality = np.hstack(centralities)
centralities.append(combine_centrality)
acc = []
parser = argparse.ArgumentParser(description='Unsupervised training')
parser.add_argument("--src_emb", type=str,
                    default='/home/mareike/PycharmProjects/breakit/embeddings/test_emb.vec')
parser.add_argument("--tgt_emb", type=str,
                    default='/home/mareike/PycharmProjects/breakit/embeddings/test_emb.vec')
parser.add_argument("--src_lang", type=str, default='en')
parser.add_argument("--tgt_lang", type=str, default='en')
parser.add_argument("--emb_dim", type=int, default=300)
parser.add_argument("--max_vocab", type=int, default=200000,
                    help="Maximum vocabulary size (-1 to disable)")
parser.add_argument("--cuda", action='store_true', default=False)
parser.add_argument("--normalize_embeddings", type=str, default="center")

# parse parameters
params = parser.parse_args()

src_dico, src_emb = load_embeddings(params, source=True)
trg_dico, trg_emb = load_embeddings(params, source=False)
#src_emb = src_emb.numpy()
#trg_emb = trg_emb.numpy()
m = extract_initial_mapping(src_emb.numpy(), trg_emb.numpy())
evaluator = CaptionEvaluator(val_metric=args.val_metric)
logger.info('----------')

# Read word map
word_map_file = os.path.join(args.data_dir, 'WORDMAP_' + args.data_name + '.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)

# Initialize / load checkpoint
if not args.checkpoint_path:
    embed_layer = load_embeddings(args.word_embed,
                                  args.word_embed_dim,
                                  word_map,
                                  skip_first_line=True,
                                  rand_range=[-0.05, 0.05],
                                  fine_tune_embeds=args.fine_tune_embeds,
                                  pad_index=C.PAD_INDEX)
    encoder = CNNEncoder(
        proj_dim=args.decoder_hidden_size if not args.use_attention else None,
        feat_layer="conv" if args.use_attention else "fc",
        fine_tune=args.fine_tune_encoder)
    if args.decoder_cell_type == "vanilla":
        decoder = RNNDecoder(vocab_size=len(word_map),
                             embed_size=args.word_embed_dim,
                             encoder_state_size=encoder.output_size,
                             hidden_size=args.decoder_hidden_size,
                             decoder_out_dropout_prob=args.decoder_dropout,
params = parser.parse_args()
params.src_lang = 'src'
params.tgt_lang = 'tgt'  # lang code can be arbitrary since we don't load dictionary files

# check parameters
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)

# load input (to translate)
print("loading input data...", file=sys.stderr)
input_sents = load_input(params.input, params.input_lowercase)  # CHECK: vocab?

# load embeddings
print("loading embeddings...", file=sys.stderr)
src_dico, _src_emb = load_embeddings(params, source=True)  # 'dico' = word2id mappings
src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
src_emb.weight.data.copy_(_src_emb)
tgt_dico, _tgt_emb = load_embeddings(params, source=False)
tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True)
tgt_emb.weight.data.copy_(_tgt_emb)
if params.cuda:
    src_emb.cuda()
    tgt_emb.cuda()

# normalize embeddings
print("normalizing embeddings...", file=sys.stderr)
params.src_mean = normalize_embeddings(src_emb.weight.data, params.normalize_embeddings)
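# A hedged sketch of what the "center" option of normalize_embeddings is taken
# to do here: subtract the per-dimension mean from every vector and return that
# mean (stored above as params.src_mean). The helper below is illustrative only
# and is not part of this codebase.
import torch

def center_embeddings(emb):
    """Mean-center embedding rows in place and return the removed mean vector."""
    mean = emb.mean(dim=0, keepdim=True)
    emb.sub_(mean.expand_as(emb))
    return mean.view(-1)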