def main():
    # Load the data
    df = helpers.load_data()
    df['body_wakati'] = df.body.apply(helpers.fetch_tokenize)

    # Build the input data and the ground-truth labels
    X = df.body_wakati.values
    le = LabelEncoder()
    y = le.fit_transform(df.category)

    # Load the pretrained Wikipedia2Vec model
    model = Wikipedia2Vec.load('models/jawiki_20180420_100d.pkl')

    # Build the pipeline and run the grid search
    pipe = make_pipeline(Wiki2Vec(model=model),
                         SVC(random_state=0, probability=True))
    param_range = [0.1, 1, 10, 100]
    param_grid = [{
        'C': param_range,
        'kernel': ['linear']
    }, {
        'C': param_range,
        'gamma': param_range,
        'kernel': ['rbf']
    }]
    best_score, best_model = evaluator.grid_search(estimator=pipe,
                                                   params=param_grid,
                                                   X=X,
                                                   y=y)

    # Save the score and the model
    save_dir = './models/wiki'
    helpers.mkdir(save_dir)
    np.savetxt(save_dir + '/accuracy.txt', np.array(best_score).reshape(1, 1))
    joblib.dump(best_model, save_dir + '/model.pkl')
def setup(wordnet_df):
    print("Start")

    # LexVec
    PATH = './scripts/pretraining_data/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors'
    lexw2v = gensim.models.KeyedVectors.load_word2vec_format(PATH)
    print("Lexvec is completed")

    # wiki2vec
    wiki2vec = Wikipedia2Vec.load(
        "./scripts/pretraining_data/enwiki_20180420_300d.pkl")
    print("Wiki2vec is completed")

    # wordnet
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    print("Wordnet is completed")

    # mcg
    mgc_di = None
    with open('./scripts/pretraining_data/mcs_memo.pickle', 'rb') as file:
        mgc_di = pickle.load(file)
    print("Mcg is completed")

    LexVec.setup(lexw2v)
    Wiki2vec.setup(wiki2vec)
    MCG.setup(mgc_di)
    WordNet.setup(brown_ic, wordnet_df)
    BERT.setup()
    print("Setup sequence is all green")
    print("Proceed to creating feature sequence")
def transform(self, X, y=None):  # returns a dataframe
    # X must be split in two parts: X_desc and X_pname
    X_desc, X_pname = X.iloc[:, 0], X.iloc[:, 1]

    # transformation of X_pname into a custom BOW
    df_pname_trans, vec_fitted = self.__compute_doc_terms_df(
        ser_desc=X_pname,
        preproc_func=self.preproc_func,
        preproc_func_params=self.preproc_func_params,
        vec_params=self.vec_params,
        tfidf_on=self.tfidf_on,
        vec=None)  # vec not fitted yet

    # transformation of X_desc into a custom BOW (reusing the fitted vec)
    df_desc_trans, _ = self.__compute_doc_terms_df(
        ser_desc=X_desc,
        preproc_func=self.preproc_func,
        preproc_func_params=self.preproc_func_params,
        vec=vec_fitted)  # reuse the vectorizer fitted above

    # Mix the X_desc and X_pname BOWs into one weighted BOW
    df_trans = (df_desc_trans.mul(1 - self.pname_weight, fill_value=0)) \
        .add(df_pname_trans.mul(self.pname_weight, fill_value=0), fill_value=0)

    # if word embedding is enabled, project the BOW on a given w2v
    if self.w2v:
        wiki2vec = Wikipedia2Vec.load(self.path_wiki2vec)
        df_trans = proj_term_doc_on_w2v(df_trans, wiki2vec, print_opt=False)

    return df_trans
def prepare(params, samples):
    # Load model
    if not os.path.exists(PATH_TO_MODEL):
        raise Exception("There is no pretrained model in \"" + PATH_TO_MODEL + "\"")
    params.model = Wikipedia2Vec.load(PATH_TO_MODEL)
    return
def main():
    argvs = sys.argv
    argc = len(argvs)
    MODEL_FILE = argvs[1]
    OUTPUT_FILE = argvs[2]

    wiki2vec = Wikipedia2Vec.load(MODEL_FILE)
    save_text(wiki2vec, OUTPUT_FILE)
def _load_wikipedia2vec(
        wiki_model_path='data/external/enwiki_20180420_100d.pkl'):
    path = os.path.join(HOME_DIR, wiki_model_path)
    if os.path.exists(path):
        return Wikipedia2Vec.load(path)
    else:
        logger.warning('No pretrained Wikipedia2Vec found.')
        return None
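# Hedged usage sketch (not part of the original source): _load_wikipedia2vec
# returns None when the pretrained pickle is missing, so callers should guard
# against that before requesting vectors. The word below is illustrative.
wiki2vec = _load_wikipedia2vec()
if wiki2vec is not None:
    # get_word_vector raises KeyError for out-of-vocabulary words
    vector = wiki2vec.get_word_vector('tokyo')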
def compute_embedding(model_name, word):
    if 'wiki' in model_name:
        model = Wikipedia2Vec.load(model_name)
        return model.get_word_vector(word)
    elif 'google' in model_name:
        model = gensim.models.KeyedVectors.load_word2vec_format(model_name,
                                                                binary=True)
        return model[word]
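# Hedged usage sketch for compute_embedding (file names are illustrative only):
# a path containing 'wiki' is loaded as a Wikipedia2Vec pickle, while a path
# containing 'google' is loaded as a binary word2vec file via gensim.
wiki_vec = compute_embedding('enwiki_20180420_100d.pkl', 'apple')
news_vec = compute_embedding('googlenews_300d.bin', 'apple')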
def get_processed_test(ent_path, sent_path, f_ent, f_sent):
    with open(join(ent_path, f_ent), 'rb') as fen:
        enl = pickle.load(fen)
    with open(join(sent_path, f_sent), 'rb') as fsent:
        sentence = pickle.load(fsent)

    data = []
    wiki2vec = Wikipedia2Vec.load(MODEL_FILE)
    nu = np.full((1, 300), -1.0).flatten()

    # Collect the (deduplicated) entity URIs for each sentence
    URL = []
    for URI in enl:
        num = len(URI)
        U = []
        if num == 0:
            URL.append([])
            continue
        if num == 1:
            URL.append([URI[0]['URI']])
            continue
        else:
            for i in range(num):
                U.append(URI[i]['URI'])
            U = set(U)
            URL.append(U)
    print(len(URL), len(sentence))

    # Build entity-pair records: [name1, name2, vec1, vec2, label, sentence, index]
    for URI, s, n in zip(URL, sentence, range(0, 500000)):
        # print(df['line_number'][i])
        num = len(URI)
        URI = list(URI)
        if num == 0:
            data.append(['NA', 'NA', nu, nu, -1, s, n])
            print(data[-1][-1])
            continue
        if num == 1:
            en1_name = URI[0].split('/')[-1].replace('_', ' ')
            try:
                en1 = wiki2vec.get_entity_vector(en1_name)
            except KeyError:
                en1 = nu
            data.append([en1_name, 'NA', en1, nu, -1, s, n])
            print(data[-1][-1])
            continue
        for i in range(num):
            en1_name = URI[i].split('/')[-1].replace('_', ' ')
            try:
                en1 = wiki2vec.get_entity_vector(en1_name)
            except KeyError:
                en1 = nu
            for j in range(i + 1, num):
                en2_name = URI[j].split('/')[-1].replace('_', ' ')
                try:
                    en2 = wiki2vec.get_entity_vector(en2_name)
                except KeyError:
                    en2 = nu
                data.append([en1_name, en2_name, en1, en2, -1, s, n])
                print(data[-1][-1])
    # print(len(data))
    return data
def __init__(self):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
        self.device = torch.device("cuda")
    else:
        torch.manual_seed(123)
        self.device = torch.device("cpu")
    self.model = Wikipedia2Vec.load(MODEL_FILE)
def __init__(self, path, prefix="ENTITY/", do_cache_dict=True, do_lower_case=False):
    from wikipedia2vec import Wikipedia2Vec, Dictionary

    if os.path.exists(path):
        self.model = Wikipedia2Vec.load(path)
    elif os.path.exists(os.path.join(RESOURCE_DIR, "wikipedia2vec", path)):
        self.model = Wikipedia2Vec.load(
            os.path.join(RESOURCE_DIR, "wikipedia2vec", path))
    else:
        raise Exception("Wikipedia2Vec model not found: " + path)

    self.dict_cache = None
    if do_cache_dict:
        self.dict_cache = {}

    self.prefix = prefix
    self.do_lower_case = do_lower_case

    assert self.prefix + "San_Francisco" in self
    assert self.prefix + "St_Linus" in self
def main(model_file, tensor_file, metadata_file, config_file, model_name,
         base_url, word_size, entity_size):
    model = Wikipedia2Vec.load(model_file)

    words = [
        w for w in sorted(model.dictionary.words(),
                          key=lambda w: w.count,
                          reverse=True)[:word_size]
    ]
    entities = [
        e for e in sorted(model.dictionary.entities(),
                          key=lambda w: w.count,
                          reverse=True)[:entity_size]
    ]

    with open(tensor_file, mode='w', encoding='utf-8') as ten:
        with open(metadata_file, mode='w', encoding='utf-8') as meta:
            meta.write('item\ttype\tcount\n')
            for word in words:
                if re.match(r"^\s*$", word.text):
                    continue
                vector_str = '\t'.join(
                    ['%.5f' % v for v in model.get_vector(word)])
                ten.write(vector_str + '\n')
                meta.write('WORD/%s\tword\t%d\n' % (word.text, word.count))

            for entity in entities:
                vector_str = '\t'.join(
                    ['%.5f' % v for v in model.get_vector(entity)])
                ten.write(vector_str + '\n')
                meta.write('ENT/%s\tentity\t%d\n' % (entity.title, entity.count))

    if model_name is None:
        model_name = languages.get(alpha2=model.dictionary.language).name

    config_obj = {
        'embeddings': [{
            "tensorName": model_name,
            'tensorShape': [word_size + entity_size, model.syn0.shape[1]],
            "tensorPath": base_url + tensor_file,
            "metadataPath": base_url + metadata_file
        }]
    }

    with open(config_file, mode='w', encoding='utf-8') as f:
        json.dump(config_obj, f, indent=2, sort_keys=True)
def train_classifier(wikipedia2vec_file, entity_linker_file, dataset,
                     dataset_path, dev_size, **kwargs):
    if dataset == '20ng':
        data = load_20ng_dataset(dev_size)
    else:
        data = load_r8_dataset(dataset_path, dev_size)

    for key, value in DEFAULT_HYPER_PARAMS[dataset].items():
        if kwargs[key] is None:
            kwargs[key] = value

    tokenizer = RegexpTokenizer()
    entity_linker = EntityLinker(entity_linker_file)
    embedding = Wikipedia2Vec.load(wikipedia2vec_file)

    return train(data, embedding, tokenizer, entity_linker, **kwargs)
def word_embedder(self):
    """Sets up the Wikipedia2Vec model from the default file used by this
    application.

    Returns:
        Wikipedia2Vec: A Wikipedia2Vec model
    """
    loc = self.model_loc(self.WIKIPEDIA_2_VEC_MODEL_NAME)
    logger.info(f'Loading Wikipedia2Vec word embeddings model from {loc}.')
    model = Wikipedia2Vec.load(loc)
    logger.debug('Model loaded successfully.')
    logger.debug('Extracting dimension from filename.')
    dim = int(re.search(r'.*_(\d*)d\.', self.WIKIPEDIA_2_VEC_MODEL_NAME).group(1))
    self.wordvec_dim = dim
    logger.debug(f'Assuming dimension {dim} for {loc}.')
    return model
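# Quick illustration (not part of the original module) of the dimension-extraction
# regex used in word_embedder: for a model file named like 'enwiki_20180420_100d.pkl'
# (hypothetical example), the captured group is the string '100'.
import re
assert re.search(r'.*_(\d*)d\.', 'enwiki_20180420_100d.pkl').group(1) == '100'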
def load_wiki2vec():
    try:
        wiki2vec = Wikipedia2Vec.load('./enwiki_20180420_500d.pkl')
    except:
        # fall back to the 100d model if the 500d pickle is unavailable
        wiki2vec = Wikipedia2Vec.load('./enwiki_20180420_100d.pkl')
    return Wiki2Vec(wiki2vec, torch.device('cpu'))
def __init__(self, path: str):
    super().__init__()
    self.__path: str = path
    self.set_model(Wikipedia2Vec.load(self.__path))
import os, re
import pickle
#import gensim
import time, random

import numpy as np
from wikipedia2vec import Wikipedia2Vec

vec_count = 50

#fkk = open('id2title.pickle', 'rb')
#id2title = pickle.load(fkk)
'''
fkk = open('id2id.pickle', 'rb')
id2id = pickle.load(fkk)
'''

wiki2vec = Wikipedia2Vec.load('enwiki-20190420-50d.pkl')
#model = gensim.models.Word2Vec.load("gensim_100_model")
#print(type(model))

counter, count_valid = 0, 0
#all_array = np.zeros(vec_count)
mmap = dict()
beg = time.time()

pre_emb = np.load('ent_embeddings_50.npy')  # fill in by random
def __init__(self):
    if path.exists('wiki_tagger.pkl'):
        print('model already made')
        self.model = Wikipedia2Vec.load('wiki_tagger.pkl')
    else:
        print('no model found')
def load_model(self):
    try:
        return Wikipedia2Vec.load(self.reference)
    except (FileNotFoundError, KeyError):
        raise FileNotFoundError
"relation", "e1_pos_begin", "e1_pos_end", "e2_pos_begin", "e2_pos_end" ]: df[column] = pd.to_numeric(df[column]) df["label"] = get_label(df["relation"].tolist(), len(Relations)) df["len"] = df["words"].map(lambda x: len(x)) df["words"] = df["words"].map(lambda x: extend(x, ["."])) df["words"] = df["words"].map(lambda x: x[:FIXED_SIZE]) df["words"] = df["words"].map( lambda x: extend(x, ["BLANK" for _ in range(FIXED_SIZE - len(x))])) df["e1"] = df["e1_pos_end"] df["e2"] = df["e2_pos_end"] # 决定使用wiki百科语料训练的词向量来进行句子表示 os.chdir("/home/zy/data/wiki_win10_300d_20180420") wiki_model = Wikipedia2Vec.load("enwiki_20180420_win10_300d.pkl") all_words = set() for i in range(len(df)): words = set(df["words"][i]) all_words = all_words.union(words) word_to_index = {} vec_list = [] index = 0 unrecord_word_cnt = 0 for word in all_words: if word == "BLANK": vec_list.append(np.zeros(shape=(EMBEDDING_DIM, ), dtype="float32")) word_to_index[word] = index
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import csv
import scipy
from scipy import stats


def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


wiki2vec = Wikipedia2Vec.load('../enwiki_20180420_100d.pkl')

N = 72
A = np.empty(N)
B = np.empty(N)
C = np.empty(N)
D = np.empty(N)
E = np.empty(N)
F = np.empty(N)
vectors = np.empty([N, 100])

f = open('metusalem2012_experiment.csv', 'w')
writer = csv.writer(f, lineterminator='\n')
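# Hedged example (words chosen for illustration only): cos_sim defined above can
# be applied directly to Wikipedia2Vec word vectors; get_word_vector raises
# KeyError if a word is out of vocabulary.
similarity = cos_sim(wiki2vec.get_word_vector('king'),
                     wiki2vec.get_word_vector('queen'))
print(similarity)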
def wikipedia2VecDemo():
    with open('enwiki_20180420_100d.pkl.bz2', 'rb') as MODEL_FILE:
        model = Wikipedia2Vec.load(MODEL_FILE)
    print(model.get_entity_vector('Scarlett Johansson'))
from wikipedia2vec import Wikipedia2Vec

model = Wikipedia2Vec.load('wiki_tagger.pkl')
newtag = input('input tag')

# Check that the tag is known to the model, either as a word or as an entity
try:
    model.get_word_vector(newtag.lower())
except KeyError:
    try:
        model.get_entity_vector(newtag)
    except KeyError:
        try:
            model.get_entity_vector(newtag.lower())
        except KeyError:
            print('adding failed')
            quit()

# Append the tag to the tag list unless it is already present
with open('app/scraping/mldata/tags.txt', 'r') as f:
    existing_tags = [line.strip() for line in f]

if newtag not in existing_tags:
    with open('app/scraping/mldata/tags.txt', 'a') as f:
        f.write('\n' + newtag)
else:
    print('tag already exists')
def __init__(self, model_path):
    self.model = Wikipedia2Vec.load(model_path)
    self.vector_size = self.model.train_params["dim_size"]
    print('Model loaded')
def __init__(self, entity_filename=config.out_ent_filename):
    self.wiki2vec = Wikipedia2Vec.load(config.wiki_model_file)
    self.entity_filename = entity_filename
    os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
print(f"# word in dataset: {len(word_vocab)}")
print(f"# entity in dataset: {len(entity_vocab)}")

path = os.path.join(args.wiki_preprocess, 'inlinks.txt')
with open(path, 'r') as f:
    for line in tqdm(f, leave=False, dynamic_ncols=True, desc="Read inlinks"):
        links = json.loads(line)
        for word in links['inlinks']:
            word_vocab.add(word)
print(f"# word in dataset + inlinks: {len(word_vocab)}")

wiki2vec = Wikipedia2Vec.load(args.wiki2vec)

inwiki_words_num = 0
word_vecs = []
word_vocab_path = os.path.join(args.wiki_preprocess, 'word_vocab.txt')
with open(word_vocab_path, 'w') as f:
    for word in tqdm(sorted(list(word_vocab)),
                     leave=False,
                     dynamic_ncols=True,
                     desc="Filter in-wiki words"):
        try:
            vec = wiki2vec.get_word_vector(word)
            word_vecs.append(vec)
            f.write(word + "\n")
            inwiki_words_num += 1
        except KeyError:
            pass
from wikipedia2vec import Wikipedia2Vec
import unidic_lite
import MeCab

tagger = MeCab.Tagger(unidic_lite.DICDIR)

MODEL_FILE = '/usr/src/taku/jawiki_20180420_300d.pkl'
wiki2vec = Wikipedia2Vec.load(MODEL_FILE)


def sentence_to_words(s, max_len=512):
    # print(s)
    words = []
    node = tagger.parseToNode(s)
    while node is not None:
        # if node.feature.startswith('名詞'):  # noun-only filter, disabled
        try:
            words.append(node.surface)
        except:
            # print("Caught it!")
            pass
        node = node.next
    return [word for word in words if len(word) > 0][:max_len]


def sentence_to_wordvecs(s, max_len=512, require_words=False):
    words = sentence_to_words(s, max_len)
    vecs = []
    words_per_sentence = []
    for word in words:
        current_length = len(vecs)
        try:
headers = {"User-Agent": "My-Bot-Name/1.0"} req = requests.get(API_URL, headers=headers, params=params) res = req.json() revision = res["query"]["pages"][0]["revisions"][0] text = revision["slots"]["main"]["content"] return mwparserfromhell.parse(text) anchors = pickle.load(open("./data/{0}/{0}.anchors.pkl".format(lang), "rb")) pageids = pickle.load(open("./data/{0}/{0}.pageids.pkl".format(lang), "rb")) redirects = pickle.load(open("./data/{0}/{0}.redirects.pkl".format(lang), "rb")) from wikipedia2vec import Wikipedia2Vec w2file = './data/{0}/{0}.w2v.bin'.format(lang) word2vec = Wikipedia2Vec.load(w2file) import fasttext navfile = './data/{0}/{0}.nav.bin'.format(lang) nav2vec = fasttext.load_model(navfile) import xgboost as xgb model = xgb.XGBClassifier() # init model model.load_model('./data/{0}/0001.link.bin'.format(lang)) # load data app = Flask(__name__) app.config["DEBUG"] = True app.config['JSON_SORT_KEYS'] = False CUSTOM_UA = 'reader session app -- [email protected]' THRESHOLD = 0.95
import pickle
#import gensim
import time

import numpy as np
from wikipedia2vec import Wikipedia2Vec

#files = os.listdir('./rel')

vec_count = 100

fkk = open('id2title.pickle', 'rb')
id2title = pickle.load(fkk)
'''
fkk = open('id2id.pickle', 'rb')
id2id = pickle.load(fkk)
'''

wiki2vec = Wikipedia2Vec.load('enwiki-20190420-50d-disambi.pkl')
#model = gensim.models.Word2Vec.load("gensim_100_model")
#print(type(model))

counter, count_valid = 0, 0
#all_array = np.zeros(vec_count)
fg = open('bad_fb', 'w')
mmap = dict()
beg = time.time()

pre_emb = np.load('ent_embeddings_50.npy')
def model_gen():
    model = Wikipedia2Vec.load(
        '/home/seanammirati/dev/audio_visual_gen/embeddings/enwiki_20180420_100d.pkl'
    )
    return model
def main(data_dir, model_file, out_format, word_analogy, word_similarity,
         entity_similarity, lowercase, batch_size, analogy_vocab_size):
    model = Wikipedia2Vec.load(model_file)

    results = []

    if word_similarity:
        base_dir = os.path.join(os.path.join(data_dir, 'word'), 'similarity')
        for filename in os.listdir(base_dir):
            if not filename.endswith('.txt'):
                continue
            oov_count = 0
            with open(os.path.join(base_dir, filename)) as f:
                gold = []
                estimated = []
                for line in f:
                    (w1, w2, val) = line.split()
                    val = float(val)
                    if lowercase:
                        (w1, w2) = (w1.lower(), w2.lower())
                    try:
                        v1 = model.get_word_vector(w1)
                    except KeyError:
                        oov_count += 1
                        continue
                    try:
                        v2 = model.get_word_vector(w2)
                    except KeyError:
                        oov_count += 1
                        continue
                    gold.append(val)
                    estimated.append(1.0 - cosine(v1, v2))

            results.append(
                (filename[:-4], spearmanr(gold, estimated)[0], oov_count))

    if word_analogy:
        if analogy_vocab_size is None:
            target_words = [w.text for w in model.dictionary.words()]
        else:
            target_words = [
                w.text for w in sorted(model.dictionary.words(),
                                       key=lambda w: w.count,
                                       reverse=True)[:analogy_vocab_size]
            ]

        word_emb = np.empty((len(target_words), model.syn0.shape[1]))
        vocab = {}
        for (n, word) in enumerate(target_words):
            word_emb[n] = model.get_word_vector(word)
            vocab[word] = n

        word_emb = word_emb / np.linalg.norm(
            word_emb, 2, axis=1, keepdims=True)

        base_dir = os.path.join(os.path.join(data_dir, 'word'), 'analogy')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                (A_ind, B_ind, C_ind, D_ind) = ([], [], [], [])
                oov_count = 0
                for (n, line) in enumerate(f):
                    if not line.startswith(':'):
                        if lowercase:
                            indices = list(map(vocab.get, line.lower().split()))
                        else:
                            indices = list(map(vocab.get, line.split()))
                        if not all(i is not None for i in indices):
                            oov_count += 1
                            continue
                        (a_ind, b_ind, c_ind, d_ind) = indices
                        A_ind.append(a_ind)
                        B_ind.append(b_ind)
                        C_ind.append(c_ind)
                        D_ind.append(d_ind)

                (A, B, C) = (word_emb[A_ind], word_emb[B_ind], word_emb[C_ind])
                D = (B - A + C)
                del A, B, C

                predictions = []
                for i in trange(0, D.shape[0], batch_size, desc=filename[:-4]):
                    D_batch = D[i:i + batch_size]
                    dot_ret = np.dot(word_emb, D_batch.T)
                    for (j, indices) in enumerate(
                            zip(A_ind[i:i + batch_size],
                                B_ind[i:i + batch_size],
                                C_ind[i:i + batch_size])):
                        dot_ret[indices, j] = float('-inf')
                    predictions.append(np.argmax(dot_ret, 0))

            results.append(
                (filename[:-4], np.mean(np.hstack(predictions) == D_ind),
                 oov_count))

    if entity_similarity:
        category_mapping = {
            e: c for (c, l) in KORE_CATEGORIES.items() for e in l
        }

        base_dir = os.path.join(os.path.join(data_dir, 'entity'), 'similarity')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                if filename == 'KORE.txt':
                    data = defaultdict(list)
                    title = None
                    for line in f:
                        line = line.rstrip()
                        if line.startswith('\t'):
                            data[title].append(line[1:])
                        else:
                            title = line

                    kore_results = defaultdict(list)
                    oov_count = 0
                    for (title, title_list) in data.items():
                        try:
                            v1 = model.get_entity_vector(title)
                        except KeyError:
                            oov_count += len(title_list)
                            continue

                        estimated = []
                        for title2 in title_list:
                            try:
                                v2 = model.get_entity_vector(title2)
                            except KeyError:
                                oov_count += 1
                                continue
                            estimated.append(1.0 - cosine(v1, v2))

                        gold = list(reversed(range(len(estimated))))
                        kore_results[category_mapping[title]].append(
                            spearmanr(gold, estimated)[0])

                    results.append(
                        (filename[:-4],
                         np.mean(list(chain(*kore_results.values()))),
                         oov_count))

                else:
                    gold = []
                    estimated = []
                    oov_count = 0
                    for (n, line) in enumerate(f):
                        if n == 0:
                            continue
                        line = line.rstrip()
                        (_, _, title1, _, _, title2, score) = line.split('\t')
                        try:
                            v1 = model.get_entity_vector(
                                title1.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue
                        try:
                            v2 = model.get_entity_vector(
                                title2.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue
                        gold.append(float(score))
                        estimated.append(1.0 - cosine(v1, v2))

                    results.append(
                        (filename[:-4], spearmanr(gold, estimated)[0],
                         oov_count))

    if out_format == 'text':
        for (name, score, oov_count) in results:
            print('%s: ' % name)
            print('  Spearman score: %.4f' % score)
            print('  OOV instances: %d' % oov_count)
    elif out_format == 'csv':
        print('name,' + ','.join([o[0] for o in results]))
        print('score,' + ','.join(['%.4f' % o[1] for o in results]))
        print('oov,' + ','.join(['%d' % o[2] for o in results]))