def mturk_test():
    embedding_dim = 300
    Y = get_mturk_outcomes()
    # Standardize each outcome column to zero mean and unit variance.
    Y = (Y - np.mean(Y, axis=0)) / np.std(Y, axis=0)
    X = np.load('../data/mturk_embedded.npz')
    # X0 = X['arr_0']
    # X1 = X['arr_1']
    X2 = X['arr_2']
    # X3 = X['arr_3']
    m = SWEM(embedding_dimension=embedding_dim,
             num_outputs=2,
             learning_rate=1e-4,
             activation_fn=tf.nn.elu,
             embedding_mlp_depth=2,
             prediction_mlp_layers=(120, 24))
    # m.train(X0, Y, plotfile='../img/X0_training.png')
    m.train(X2, Y[:, :2], plotfile='../img/X2_Y01_training.png',
            batch_size=100, epochs=20)
    # m.train(X2, Y, plotfile='../img/X2_training.png')
def random_noise_test():
    embedding_dim = 300
    data_size = 1000
    X = [
        np.random.randn(np.random.randint(10, 100), embedding_dim)
        for i in range(data_size)
    ]
    Y = .2 * np.random.randn(data_size) + .5
    m = SWEM(embedding_dimension=embedding_dim)
    m.train(X, Y, plotfile='../img/test_training.png')
def get(ddir: str, ft_path: str, split: str):
    random.seed(1111)
    ddir = Path(ddir)
    ft_model = fastText.load_model(ft_path)
    swem = SWEM(ft_model)
    quality = lf.TextDataset(str(ddir / f'quality.{split}.txt')).map(int)
    sent1 = lf.TextDataset(str(ddir / f'sent1.{split}.txt')).map(sent_preprocess(swem))
    sent2 = lf.TextDataset(str(ddir / f'sent2.{split}.txt')).map(sent_preprocess(swem))
    ds = lf.zip(quality, sent1, sent2)
    return ds
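# `sent_preprocess` is used above but not defined in this excerpt. A minimal sketch of
# the kind of callable it could return, assuming the fastText-backed SWEM wrapper
# exposes an `average_pooling(text)` method like the gensim-based variant further
# below; this is an illustrative assumption, not the repository's actual helper.
def sent_preprocess(swem):
    def _embed(sentence: str):
        # Map one raw sentence line to a fixed-size SWEM embedding.
        return swem.average_pooling(sentence.strip())
    return _embed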
def test_get(ddir: str, savedir: str, bsize: int, ft_path: str):
    ddir = Path(ddir)
    savedir = Path(savedir)
    ft_model = fastText.load_model(ft_path)
    swem = SWEM(ft_model)
    quality = lf.TextDataset(str(ddir / 'quality.test.txt')).map(int)
    sent1 = lf.TextDataset(str(ddir / 'sent1.test.txt')).map(sent_preprocess(swem))
    sent2 = lf.TextDataset(str(ddir / 'sent2.test.txt')).map(sent_preprocess(swem))
    ds = lf.zip(quality, sent1, sent2)
    # Cache the preprocessed test set to disk, then serve it through a DataLoader.
    test_dataloader = DataLoader(
        ds.save(savedir / 'swem.test.cache'),
        batch_size=bsize,
        shuffle=False,
        num_workers=4,
        collate_fn=get_collate_fn()
    )
    return test_dataloader
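# `get_collate_fn` is also referenced but not shown. A minimal sketch, assuming each
# example is a (quality, sent1_vector, sent2_vector) tuple of fixed-size SWEM
# embeddings; the real collate function in the repository may differ.
import numpy as np
import torch

def get_collate_fn():
    def collate(batch):
        qualities, sents1, sents2 = zip(*batch)
        return (
            torch.tensor(qualities, dtype=torch.long),
            torch.tensor(np.stack(sents1), dtype=torch.float32),
            torch.tensor(np.stack(sents2), dtype=torch.float32),
        )
    return collate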
def main(name_list_file, postfix, w2v_model, pkl_filename):
    f = open(name_list_file)
    reader = csv.reader(f)
    header = next(reader)  # skip the header row
    nouns_set = build_nouns_set(name_list_file, postfix)

    # When loading the Word2Vec model, adjust the loading procedure to match the
    # model file's format.
    # For KeyedVectors (saved with save_word2vec_format):
    from gensim.models import KeyedVectors
    w2v = KeyedVectors.load_word2vec_format(w2v_model, binary=True)  # .bin format
    # w2v = KeyedVectors.load_word2vec_format(w2v_model, binary=False)  # .txt format
    # For a model saved with Word2Vec.save:
    # import gensim.models.doc2vec as doc2vec
    # w2v = doc2vec.Doc2Vec.load("pixiv/doc2vec.model")
    # For a fastText model:
    # from gensim.models.wrappers.fasttext import FastText
    # w2v = FastText.load_fasttext_format('pixiv/fasttext-model.bin')

    tokenizer = MeCabTokenizerWithStopWord(
        mecab_args=f"-O wakati -d {mecab_system_dic}",
        nouns=nouns_set)
    swem = SWEM(w2v, tokenizer)

    names = []
    vecs = []
    attributes = {}
    for row in reader:
        name = row[0]
        nickname = row[1]
        attribute = row[2]
        if attribute not in attributes:
            attributes[attribute] = []
        with open("data/" + name + ".txt") as n:
            text = n.read()
        text = normalize(text)
        vec = swem.average_pooling(text)
        # vec = swem.max_pooling(text)
        if postfix in name:
            name = name.replace(postfix, "")
        names.append(name)
        vecs.append(vec)
        attributes[attribute].append(vec)

    # Build the average vector for each attribute.
    for key in attributes:
        ave = np.average([v for v in attributes[key]], axis=0)
        names.append(key)
        vecs.append(ave)
    f.close()

    idolvecs = [names, vecs]
    # Save to a pkl file.
    with open(pkl_filename, 'wb') as pkl:
        pickle.dump(idolvecs, pkl)
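# Example of consuming the pickle written by main(): load the [names, vecs] pair and
# rank entries by cosine similarity to a query name. A minimal sketch; `most_similar`
# and its arguments are illustrative, not part of the original script.
import pickle
import numpy as np

def most_similar(query_name, pkl_filename, topn=5):
    with open(pkl_filename, 'rb') as pkl:
        names, vecs = pickle.load(pkl)
    vecs = np.array(vecs)
    q = vecs[names.index(query_name)]
    # Cosine similarity between the query vector and every stored vector.
    sims = vecs @ q / (np.linalg.norm(vecs, axis=1) * np.linalg.norm(q) + 1e-12)
    order = np.argsort(-sims)
    return [(names[i], float(sims[i])) for i in order if names[i] != query_name][:topn]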
def get_request(doc):
    # Build one Elasticsearch bulk "index" action carrying the SWEM-embedded text.
    return {
        "_op_type": "index",
        "_index": INDEX_NAME,
        "text": doc["text"],
        "title": doc["title"],
        "text_vector": swem.average_pooling(doc["text"]).tolist()
    }


# embedding
w2v_path = "jawiki.word_vectors.200d.txt"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
tokenizer = MeCabTokenizer("-O wakati")
swem = SWEM(w2v, tokenizer)

# elasticsearch
client = Elasticsearch("http://es-study:9200")

BATCH_SIZE = 1000
INDEX_NAME = "wikipedia"

# Recreate the index from the mapping in index.json (ignore 404 if it does not exist yet).
client.indices.delete(index=INDEX_NAME, ignore=[404])
with open("index.json") as index_file:
    source = index_file.read().strip()
    client.indices.create(index=INDEX_NAME, body=source)

docs = []
count = 0
with gzip.open("jawikisource-20210510-cirrussearch-content.json.gz") as f:
    for line in f: