Example #1
def main():
    # Load the data
    df = helpers.load_data()
    df['body_wakati'] = df.body.apply(helpers.fetch_tokenize)

    # Build the input data and the target labels
    X = df.body_wakati.values
    le = LabelEncoder()
    y = le.fit_transform(df.category)

    # Load the pretrained Wikipedia2Vec model
    model = Wikipedia2Vec.load('models/jawiki_20180420_100d.pkl')

    # Build the pipeline and run a grid search
    pipe = make_pipeline(Wiki2Vec(model=model),
                         SVC(random_state=0, probability=True))
    param_range = [0.1, 1, 10, 100]
    param_grid = [{
        'C': param_range,
        'kernel': 'linear'
    }, {
        'C': param_range,
        'gamma': param_range,
        'kernel': 'rbf'
    }]
    best_score, best_model = evaluator.grid_search(estimator=pipe,
                                                   params=param_grid,
                                                   X=X,
                                                   y=y)

    # Save the score and the model
    save_dir = './models/wiki'
    helpers.mkdir(save_dir)
    np.savetxt(save_dir + '/accuracy.txt', np.array(best_score).reshape(1, 1))
    joblib.dump(best_model, save_dir + '/model.pkl')
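The Wiki2Vec step used in the pipeline above is a project-specific transformer that the snippet does not show. Below is a minimal sketch of what such a scikit-learn compatible transformer could look like, assuming it averages Wikipedia2Vec word vectors over the whitespace-tokenized text; the class body is an assumption, not the original implementation.

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class Wiki2Vec(BaseEstimator, TransformerMixin):
    # Hypothetical transformer: one averaged Wikipedia2Vec word vector per document.
    def __init__(self, model=None, dim=100):
        self.model = model  # a loaded Wikipedia2Vec instance
        self.dim = dim      # dimensionality of the pretrained vectors

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        rows = []
        for text in X:  # each element is a wakati-tokenized (space-separated) string
            vecs = []
            for token in text.split():
                try:
                    vecs.append(self.model.get_word_vector(token))
                except KeyError:
                    continue  # skip out-of-vocabulary tokens
            rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(self.dim))
        return np.vstack(rows)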
Example #2
def setup(wordnet_df):
    print("Start")
    PATH = './scripts/pretraining_data/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors'
    lexw2v = gensim.models.KeyedVectors.load_word2vec_format(PATH)
    print("Lexvec is completed")
    # wiki2vec
    wiki2vec = Wikipedia2Vec.load(
        "./scripts/pretraining_data/enwiki_20180420_300d.pkl")
    print("Wiki2vec is completed")
    # wordnet
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    print("Wordnet is completed")
    #mcg
    mgc_di = None
    with open('./scripts/pretraining_data/mcs_memo.pickle', 'rb') as file:
        mgc_di = pickle.load(file)
    print("Mcg is completed")

    LexVec.setup(lexw2v)
    Wiki2vec.setup(wiki2vec)
    MCG.setup(mgc_di)
    WordNet.setup(brown_ic, wordnet_df)
    BERT.setup()

    print("Setup sequence is all green")

    print("Proceed to creating feature sequence")
Example #3
    def transform(self, X, y=None):  # returns a dataframe

        # X must be split into two parts: X_desc and X_pname
        X_desc, X_pname = X.iloc[:, 0], X.iloc[:, 1]

        # transformation of X_pname into a custom BOW
        df_pname_trans, vec_fitted = self.__compute_doc_terms_df(\
                     ser_desc=X_pname,
                     preproc_func=self.preproc_func,
                     preproc_func_params=self.preproc_func_params,
                     vec_params=self.vec_params,
                     tfidf_on=self.tfidf_on,
                     vec=None) # vec not fitted yet

        # transformation of X_desc into a custom BOW (reusing vec fitted on pname)
        df_desc_trans, _ = self.__compute_doc_terms_df(\
                     ser_desc=X_desc,
                     preproc_func=self.preproc_func,
                     preproc_func_params=self.preproc_func_params,
                     vec=vec_fitted) # vec already fitted on the product names

        # Mix the X_desc and X_pname BOWs into one BOW (weight)

        df_trans = (df_desc_trans.mul(1-self.pname_weight,
                                      fill_value=0))\
                    .add(df_pname_trans.mul(self.pname_weight,
                                            fill_value=0),
                        fill_value=0)
        # if word_embedding is enabled, projection of the BOW on a given w2v
        if self.w2v:
            wiki2vec = Wikipedia2Vec.load(self.path_wiki2vec)
            df_trans = proj_term_doc_on_w2v(df_trans,
                                            wiki2vec,
                                            print_opt=False)
        return df_trans
Example #4
def prepare(params, samples):
    # Load model
    if not os.path.exists(PATH_TO_MODEL):
        raise Exception("There are no pretrained model in \"" + PATH_TO_MODEL +
                        "\"")

    params.model = Wikipedia2Vec.load(PATH_TO_MODEL)
    return
Example #5
def main():
    argvs = sys.argv
    argc = len(argvs)
    if argc != 3:
        print('Usage: %s MODEL_FILE OUTPUT_FILE' % argvs[0])
        sys.exit(1)

    MODEL_FILE = argvs[1]
    OUTPUT_FILE = argvs[2]
    wiki2vec = Wikipedia2Vec.load(MODEL_FILE)
    save_text(wiki2vec, OUTPUT_FILE)
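save_text is not defined in this snippet; here is a minimal sketch of such a helper, assuming it exports the word vectors in plain word2vec text format (the body below is an assumption, not the original code).

def save_text(model, out_file):
    # Write "<vocab size> <dim>" followed by one "<word> <vector>" line per word.
    words = list(model.dictionary.words())
    dim = model.syn0.shape[1]
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write('%d %d\n' % (len(words), dim))
        for word in words:
            vec = model.get_word_vector(word.text)
            f.write(word.text + ' ' + ' '.join('%.6f' % v for v in vec) + '\n')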
Example #6
def _load_wikipedia2vec(
        wiki_model_path='data/external/enwiki_20180420_100d.pkl'):
    path = os.path.join(HOME_DIR, wiki_model_path)
    if os.path.exists(path):
        return Wikipedia2Vec.load(path)
    else:
        logger.warning('No pretrained Wikipedia2Vec found.')
        return None
Example #7
def compute_embedding(model_name, word):
    if 'wiki' in model_name:
        model = Wikipedia2Vec.load(model_name)
        return model.get_word_vector(word)
    elif 'google' in model_name:
        model = gensim.models.KeyedVectors.load_word2vec_format(model_name,
                                                                binary=True)
        return model[word]
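A short usage sketch for compute_embedding; the file names below are placeholders, and only the substring in the path ('wiki' or 'google') decides which branch is taken.

vec = compute_embedding('enwiki_20180420_100d.pkl', 'apple')   # Wikipedia2Vec branch
# vec = compute_embedding('googlenews_vectors.bin', 'apple')   # gensim word2vec branch
print(vec.shape)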
Example #8
def get_processed_test(ent_path, sent_path, f_ent, f_sent):
    with open(join(ent_path, f_ent), 'rb') as fen:
        enl = pickle.load(fen)
    with open(join(sent_path, f_sent), 'rb') as fsent:
        sentence = pickle.load(fsent)
    data = []
    wiki2vec = Wikipedia2Vec.load(MODEL_FILE)
    nu = np.full((1, 300), -1.0).flatten()
    URL = []
    for URI in enl:
        num = len(URI)
        U = []
        if num == 0:
            URL.append([])
            continue
        if num == 1:
            URL.append([URI[0]['URI']])
            continue
        else:
            for i in range(num):
                U.append(URI[i]['URI'])
            U = set(U)
            URL.append(U)

    print(len(URL), len(sentence))
    for URI, s, n in zip(URL, sentence, range(0, 500000)):
        # print(df['line_number'][i])
        num = len(URI)
        URI = list(URI)
        if num == 0:
            data.append(['NA', 'NA', nu, nu, -1, s, n])
            print(data[-1][-1])
            continue
        if num == 1:
            en1_name = URI[0].split('/')[-1].replace('_', ' ')
            try:
                en1 = wiki2vec.get_entity_vector(en1_name)
            except KeyError:
                en1 = nu
            data.append([en1_name, 'NA', en1, nu, -1, s, n])
            print(data[-1][-1])
            continue
        for i in range(num):
            en1_name = URI[i].split('/')[-1].replace('_', ' ')
            try:
                en1 = wiki2vec.get_entity_vector(en1_name)
            except KeyError:
                en1 = nu
            for j in range(i + 1, num):
                en2_name = URI[j].split('/')[-1].replace('_', ' ')
                try:
                    en2 = wiki2vec.get_entity_vector(en2_name)
                except KeyError:
                    en2 = nu
                data.append([en1_name, en2_name, en1, en2, -1, s, n])
                print(data[-1][-1])
    # print(len(data))
    return data
Example #9
    def __init__(self):
        if torch.cuda.is_available():
            torch.cuda.manual_seed(123)
            self.device = torch.device("cuda")
        else:
            torch.manual_seed(123)
            self.device = torch.device("cpu")

        self.model = Wikipedia2Vec.load(MODEL_FILE)
Example #10
    def __init__(self,
                 path,
                 prefix="ENTITY/",
                 do_cache_dict=True,
                 do_lower_case=False):
        from wikipedia2vec import Wikipedia2Vec, Dictionary
        if os.path.exists(path):
            self.model = Wikipedia2Vec.load(path)
        elif os.path.exists(os.path.join(RESOURCE_DIR, "wikipedia2vec", path)):
            self.model = Wikipedia2Vec.load(
                os.path.join(RESOURCE_DIR, "wikipedia2vec", path))
        else:
            raise Exception("Wikipedia2Vec model not found: " + path)

        self.dict_cache = None
        if do_cache_dict:
            self.dict_cache = {}

        self.prefix = prefix
        self.do_lower_case = do_lower_case

        assert self.prefix + "San_Francisco" in self
        assert self.prefix + "St_Linus" in self
Example #11
def main(model_file, tensor_file, metadata_file, config_file, model_name,
         base_url, word_size, entity_size):
    model = Wikipedia2Vec.load(model_file)
    words = [
        w for w in sorted(model.dictionary.words(),
                          key=lambda w: w.count,
                          reverse=True)[:word_size]
    ]
    entities = [
        e for e in sorted(model.dictionary.entities(),
                          key=lambda w: w.count,
                          reverse=True)[:entity_size]
    ]

    with open(tensor_file, mode='w', encoding='utf-8') as ten:
        with open(metadata_file, mode='w', encoding='utf-8') as meta:
            meta.write('item\ttype\tcount\n')
            for word in words:
                if re.match(r"^\s*$", word.text):
                    continue
                vector_str = '\t'.join(
                    ['%.5f' % v for v in model.get_vector(word)])
                ten.write(vector_str + '\n')
                meta.write('WORD/%s\tword\t%d\n' % (word.text, word.count))

            for entity in entities:
                vector_str = '\t'.join(
                    ['%.5f' % v for v in model.get_vector(entity)])
                ten.write(vector_str + '\n')
                meta.write('ENT/%s\tentity\t%d\n' %
                           (entity.title, entity.count))

    if model_name is None:
        model_name = languages.get(alpha2=model.dictionary.language).name

    config_obj = {
        'embeddings': [{
            "tensorName":
            model_name,
            'tensorShape': [word_size + entity_size, model.syn0.shape[1]],
            "tensorPath":
            base_url + tensor_file,
            "metadataPath":
            base_url + metadata_file
        }]
    }

    with open(config_file, mode='w', encoding='utf-8') as f:
        json.dump(config_obj, f, indent=2, sort_keys=True)
Example #12
def train_classifier(wikipedia2vec_file, entity_linker_file, dataset,
                     dataset_path, dev_size, **kwargs):
    if dataset == '20ng':
        data = load_20ng_dataset(dev_size)
    else:
        data = load_r8_dataset(dataset_path, dev_size)

    for key, value in DEFAULT_HYPER_PARAMS[dataset].items():
        if kwargs[key] is None:
            kwargs[key] = value

    tokenizer = RegexpTokenizer()
    entity_linker = EntityLinker(entity_linker_file)
    embedding = Wikipedia2Vec.load(wikipedia2vec_file)

    return train(data, embedding, tokenizer, entity_linker, **kwargs)
Example #13
    def word_embedder(self):
        """Sets up the Wikipedia2Vec model from the default file used by this
        application.

        Returns:
            Wikipedia2Vec: A Wikipedia2Vec Model
        """
        loc = self.model_loc(self.WIKIPEDIA_2_VEC_MODEL_NAME)
        logger.info(f'Loading Wikipedia2Vec word embeddings model from {loc}.')
        model = Wikipedia2Vec.load(loc)
        logger.debug('Model loaded successfully.')
        logger.debug('Extracting dimension from filename.')
        dim = int(re.search(r'.*_(\d*)d\.',
                            self.WIKIPEDIA_2_VEC_MODEL_NAME).group(1))
        self.wordvec_dim = dim
        logger.debug(f'Assuming dimension {dim} for {loc}.')

        return model
Example #14
def load_wiki2vec():
  try:
    wiki2vec = Wikipedia2Vec.load('./enwiki_20180420_500d.pkl')
  except FileNotFoundError:  # fall back to the smaller pretrained model
    wiki2vec = Wikipedia2Vec.load('./enwiki_20180420_100d.pkl')
  return Wiki2Vec(wiki2vec, torch.device('cpu'))
Example #15
    def __init__(self, path: str):
        super().__init__()
        self.__path: str = path

        self.set_model(Wikipedia2Vec.load(self.__path))
Example #16
import os, re
import pickle
#import gensim
import time, random
import numpy as np
from wikipedia2vec import Wikipedia2Vec

vec_count = 50
#fkk = open('id2title.pickle', 'rb')
#id2title = pickle.load(fkk)

'''
fkk = open('id2id.pickle', 'rb')
id2id = pickle.load(fkk)
'''

wiki2vec = Wikipedia2Vec.load('enwiki-20190420-50d.pkl')

#model = gensim.models.Word2Vec.load("gensim_100_model")
#print(type(model))

counter, count_valid = 0, 0

#all_array = np.zeros(vec_count)

mmap = dict()

beg = time.time()

pre_emb = np.load('ent_embeddings_50.npy')
# fillin by random
Example #17
 def __init__(self):
     if path.exists('wiki_tagger.pkl'):
         print('model already made')
         self.model = Wikipedia2Vec.load('wiki_tagger.pkl')
     else:
         print('no model found')
Example #18
 def load_model(self):
     try:
         return Wikipedia2Vec.load(self.reference)
     except (FileNotFoundError, KeyError):
         raise FileNotFoundError
Example #19
            "relation", "e1_pos_begin", "e1_pos_end", "e2_pos_begin",
            "e2_pos_end"
    ]:
        df[column] = pd.to_numeric(df[column])
    df["label"] = get_label(df["relation"].tolist(), len(Relations))
    df["len"] = df["words"].map(lambda x: len(x))
    df["words"] = df["words"].map(lambda x: extend(x, ["."]))
    df["words"] = df["words"].map(lambda x: x[:FIXED_SIZE])
    df["words"] = df["words"].map(
        lambda x: extend(x, ["BLANK" for _ in range(FIXED_SIZE - len(x))]))
    df["e1"] = df["e1_pos_end"]
    df["e2"] = df["e2_pos_end"]

    # Use word vectors trained on the Wikipedia corpus to represent the sentences
    os.chdir("/home/zy/data/wiki_win10_300d_20180420")
    wiki_model = Wikipedia2Vec.load("enwiki_20180420_win10_300d.pkl")

    all_words = set()

    for i in range(len(df)):
        words = set(df["words"][i])
        all_words = all_words.union(words)

    word_to_index = {}
    vec_list = []
    index = 0
    unrecord_word_cnt = 0
    for word in all_words:
        if word == "BLANK":
            vec_list.append(np.zeros(shape=(EMBEDDING_DIM, ), dtype="float32"))
            word_to_index[word] = index
Example #20
import numpy as np
from wikipedia2vec import Wikipedia2Vec

np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import csv

import scipy
from scipy import stats


def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


wiki2vec = Wikipedia2Vec.load('../enwiki_20180420_100d.pkl')

N = 72

A = np.empty(N)
B = np.empty(N)
C = np.empty(N)
D = np.empty(N)
E = np.empty(N)
F = np.empty(N)

vectors = np.empty([N, 100])

f = open('metusalem2012_experiment.csv', 'w')
writer = csv.writer(f, lineterminator='\n')
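A quick usage sketch of cos_sim with word vectors taken from the loaded model; the example words are arbitrary and assumed to be in the vocabulary (get_word_vector raises KeyError otherwise).

v1 = wiki2vec.get_word_vector('king')
v2 = wiki2vec.get_word_vector('queen')
print(cos_sim(v1, v2))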
Example #21
def wikipedia2VecDemo():
    with open('enwiki_20180420_100d.pkl.bz2', 'rb') as MODEL_FILE:
        model = Wikipedia2Vec.load(MODEL_FILE)
        print(model.get_entity_vector('Scarlett Johansson'))
Example #22
from wikipedia2vec import Wikipedia2Vec
model = Wikipedia2Vec.load('wiki_tagger.pkl')
newtag = input('input tag')
try:
    model.get_word_vector(newtag.lower())
except KeyError:
    try:
        model.get_entity_vector(newtag)
    except KeyError:
        try:
            model.get_entity_vector(newtag.lower())
        except KeyError:
            print('adding failed')
            quit()
if newtag not in open('app/scraping/mldata/tags.txt', 'r').read().splitlines():
    with open('app/scraping/mldata/tags.txt','a') as f:
        f.write('\n'+newtag)
else:
    print('tag already exists')
Example #23
 def __init__(self, model_path):
     self.model = Wikipedia2Vec.load(model_path)
     self.vector_size = self.model.train_params["dim_size"]
     print('Model loaded')
Example #24
 def __init__(self, entity_filename=config.out_ent_filename):
     self.wiki2vec = Wikipedia2Vec.load(config.wiki_model_file)
     self.entity_filename = entity_filename
Example #25
        os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
    print(f"# word in dataset: {len(word_vocab)}")
    print(f"# entity in dataset: {len(entity_vocab)}")

    path = os.path.join(args.wiki_preprocess, 'inlinks.txt')
    with open(path, 'r') as f:
        for line in tqdm(f,
                         leave=False,
                         dynamic_ncols=True,
                         desc="Read inlniks"):
            links = json.loads(line)
            for word in links['inlinks']:
                word_vocab.add(word)
    print(f"# word in dataset + inlinks: {len(word_vocab)}")

    wiki2vec = Wikipedia2Vec.load(args.wiki2vec)
    inwiki_words_num = 0
    word_vecs = []
    word_vocab_path = os.path.join(args.wiki_preprocess, 'word_vocab.txt')
    with open(word_vocab_path, 'w') as f:
        for word in tqdm(sorted(list(word_vocab)),
                         leave=False,
                         dynamic_ncols=True,
                         desc="Filter in-wiki words"):
            try:
                vec = wiki2vec.get_word_vector(word)
                word_vecs.append(vec)
                f.write(word + "\n")
                inwiki_words_num += 1
            except KeyError:
                pass
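    # Hypothetical continuation (an assumption, not the original code): stack the
    # kept vectors and save them next to the vocabulary file.
    print(f"# in-wiki words kept: {inwiki_words_num}")
    word_vecs = np.stack(word_vecs)  # shape: (inwiki_words_num, dim)
    np.save(os.path.join(args.wiki_preprocess, 'word_vecs.npy'), word_vecs)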
Example #26
from wikipedia2vec import Wikipedia2Vec
import unidic_lite
import MeCab

tagger = MeCab.Tagger(unidic_lite.DICDIR)
MODEL_FILE = '/usr/src/taku/jawiki_20180420_300d.pkl'
wiki2vec = Wikipedia2Vec.load(MODEL_FILE)


def sentence_to_words(s, max_len=512):
    # print(s)
    words = []
    node = tagger.parseToNode(s)
    while node is not None:
        #if node.feature.startswith('名詞'):  # (keep nouns only)
        try:
            words.append(node.surface)
        except:
            #print("Caught it!")
            pass
        node = node.next
    return [word for word in words if len(word) > 0][:max_len]


def sentence_to_wordvecs(s, max_len=512, require_words=False):
    words = sentence_to_words(s, max_len)
    vecs = []
    words_per_sentence = []
    for word in words:
        current_length = len(vecs)
        try:
Example #27
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    req = requests.get(API_URL, headers=headers, params=params)
    res = req.json()
    revision = res["query"]["pages"][0]["revisions"][0]
    text = revision["slots"]["main"]["content"]
    return mwparserfromhell.parse(text)


anchors = pickle.load(open("./data/{0}/{0}.anchors.pkl".format(lang), "rb"))
pageids = pickle.load(open("./data/{0}/{0}.pageids.pkl".format(lang), "rb"))
redirects = pickle.load(open("./data/{0}/{0}.redirects.pkl".format(lang),
                             "rb"))

from wikipedia2vec import Wikipedia2Vec
w2file = './data/{0}/{0}.w2v.bin'.format(lang)
word2vec = Wikipedia2Vec.load(w2file)

import fasttext
navfile = './data/{0}/{0}.nav.bin'.format(lang)
nav2vec = fasttext.load_model(navfile)

import xgboost as xgb
model = xgb.XGBClassifier()  # init model
model.load_model('./data/{0}/0001.link.bin'.format(lang))  # load data

app = Flask(__name__)
app.config["DEBUG"] = True
app.config['JSON_SORT_KEYS'] = False
CUSTOM_UA = 'reader session app -- [email protected]'

THRESHOLD = 0.95
Example #28
import pickle
#import gensim
import time
import numpy as np
from wikipedia2vec import Wikipedia2Vec

#files = os.listdir('./rel')

vec_count = 100
fkk = open('id2title.pickle', 'rb')
id2title = pickle.load(fkk)
'''
fkk = open('id2id.pickle', 'rb')
id2id = pickle.load(fkk)
'''

wiki2vec = Wikipedia2Vec.load('enwiki-20190420-50d-disambi.pkl')

#model = gensim.models.Word2Vec.load("gensim_100_model")
#print(type(model))

counter, count_valid = 0, 0

#all_array = np.zeros(vec_count)

fg = open('bad_fb', 'w')
mmap = dict()

beg = time.time()

pre_emb = np.load('ent_embeddings_50.npy')
Example #29
def model_gen():
    model = Wikipedia2Vec.load(
        '/home/seanammirati/dev/audio_visual_gen/embeddings/enwiki_20180420_100d.pkl'
    )

    return model
Example #30
def main(data_dir, model_file, out_format, word_analogy, word_similarity,
         entity_similarity, lowercase, batch_size, analogy_vocab_size):
    model = Wikipedia2Vec.load(model_file)

    results = []

    if word_similarity:
        base_dir = os.path.join(os.path.join(data_dir, 'word'), 'similarity')
        for filename in os.listdir(base_dir):
            if not filename.endswith('.txt'):
                continue

            oov_count = 0
            with open(os.path.join(base_dir, filename)) as f:
                gold = []
                estimated = []
                for line in f:
                    (w1, w2, val) = line.split()
                    val = float(val)
                    if lowercase:
                        (w1, w2) = (w1.lower(), w2.lower())
                    try:
                        v1 = model.get_word_vector(w1)
                    except KeyError:
                        oov_count += 1
                        continue
                    try:
                        v2 = model.get_word_vector(w2)
                    except KeyError:
                        oov_count += 1
                        continue

                    gold.append(val)
                    estimated.append(1.0 - cosine(v1, v2))

                results.append(
                    (filename[:-4], spearmanr(gold, estimated)[0], oov_count))

    if word_analogy:
        if analogy_vocab_size is None:
            target_words = [w.text for w in model.dictionary.words()]
        else:
            target_words = [
                w.text for w in sorted(model.dictionary.words(),
                                       key=lambda w: w.count,
                                       reverse=True)[:analogy_vocab_size]
            ]

        word_emb = np.empty((len(target_words), model.syn0.shape[1]))
        vocab = {}
        for (n, word) in enumerate(target_words):
            word_emb[n] = model.get_word_vector(word)
            vocab[word] = n
        word_emb = word_emb / np.linalg.norm(
            word_emb, 2, axis=1, keepdims=True)

        base_dir = os.path.join(os.path.join(data_dir, 'word'), 'analogy')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                (A_ind, B_ind, C_ind, D_ind) = ([], [], [], [])
                oov_count = 0
                for (n, line) in enumerate(f):
                    if not line.startswith(':'):
                        if lowercase:
                            indices = list(map(vocab.get,
                                               line.lower().split()))
                        else:
                            indices = list(map(vocab.get, line.split()))
                        if not all(i is not None for i in indices):
                            oov_count += 1
                            continue

                        (a_ind, b_ind, c_ind, d_ind) = indices
                        A_ind.append(a_ind)
                        B_ind.append(b_ind)
                        C_ind.append(c_ind)
                        D_ind.append(d_ind)

                (A, B, C) = (word_emb[A_ind], word_emb[B_ind], word_emb[C_ind])
                D = (B - A + C)
                del A, B, C

                predictions = []

                for i in trange(0, D.shape[0], batch_size, desc=filename[:-4]):
                    D_batch = D[i:i + batch_size]
                    dot_ret = np.dot(word_emb, D_batch.T)
                    for (j, indices) in enumerate(
                            zip(A_ind[i:i + batch_size],
                                B_ind[i:i + batch_size],
                                C_ind[i:i + batch_size])):
                        dot_ret[indices, j] = float('-inf')
                    predictions.append(np.argmax(dot_ret, 0))

                results.append(
                    (filename[:-4], np.mean(np.hstack(predictions) == D_ind),
                     oov_count))

    if entity_similarity:
        category_mapping = {
            e: c
            for (c, l) in KORE_CATEGORIES.items() for e in l
        }

        base_dir = os.path.join(os.path.join(data_dir, 'entity'), 'similarity')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                if filename == 'KORE.txt':
                    data = defaultdict(list)
                    title = None
                    for line in f:
                        line = line.rstrip()
                        if line.startswith('\t'):
                            data[title].append(line[1:])
                        else:
                            title = line

                    kore_results = defaultdict(list)
                    oov_count = 0
                    for (title, title_list) in data.items():
                        try:
                            v1 = model.get_entity_vector(title)
                        except KeyError:
                            oov_count += len(title_list)
                            continue

                        estimated = []
                        for title2 in title_list:
                            try:
                                v2 = model.get_entity_vector(title2)
                            except KeyError:
                                oov_count += 1
                                continue
                            estimated.append(1.0 - cosine(v1, v2))

                        gold = list(reversed(range(len(estimated))))
                        kore_results[category_mapping[title]].append(
                            spearmanr(gold, estimated)[0])

                    results.append(
                        (filename[:-4],
                         np.mean(list(chain(*kore_results.values()))),
                         oov_count))

                else:
                    gold = []
                    estimated = []
                    oov_count = 0
                    for (n, line) in enumerate(f):
                        if n == 0:
                            continue
                        line = line.rstrip()
                        (_, _, title1, _, _, title2, score) = line.split('\t')

                        try:
                            v1 = model.get_entity_vector(
                                title1.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue
                        try:
                            v2 = model.get_entity_vector(
                                title2.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue

                        gold.append(float(score))
                        estimated.append(1.0 - cosine(v1, v2))

                    results.append(
                        (filename[:-4], spearmanr(gold,
                                                  estimated)[0], oov_count))

    if out_format == 'text':
        for (name, score, oov_count) in results:
            print('%s: ' % name)
            print('  Spearman score: %.4f' % score)
            print('  OOV instances: %d' % oov_count)

    elif out_format == 'csv':
        print('name,' + ','.join([o[0] for o in results]))
        print('score,' + ','.join(['%.4f' % o[1] for o in results]))
        print('oov,' + ','.join(['%d' % o[2] for o in results]))