import numpy as np
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import loadWord2Vec, clean_str


def load_or_build_embedding(ds, vocab):
    # One-hot embedding
    # embd = eye(len(vocab))
    # return embd

    # Read Word Vectors
    # word_vector_file = 'data/glove.6B/glove.6B.300d.txt'
    # word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
    # _, embd, word_vector_map = loadWord2Vec(word_vector_file)
    # word_embeddings_dim = len(embd[0])
    try:
        word_vector_file = 'data/corpus/' + ds + '_word_vectors.txt'
        word_vec_vocab, embd, word_vec_id_map = loadWord2Vec(word_vector_file)
        word_embeddings_dim = len(embd[0])
        # word embedding matrix
        wm = np.matrix(embd)
        return word_vec_vocab, wm, word_vec_id_map
    except Exception:
        print('Building embedding...')

    # Build word vectors from the WordNet definitions of each vocabulary word.
    definitions = []
    for word in vocab:
        word = word.strip()
        synsets = wn.synsets(clean_str(word))
        word_defs = []
        for synset in synsets:
            syn_def = synset.definition()
            word_defs.append(syn_def)
        word_des = ' '.join(word_defs)
        if word_des == '':
            word_des = '<PAD>'
        definitions.append(word_des)

    tfidf_vec = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vec.fit_transform(definitions)
    tfidf_matrix_array = tfidf_matrix.toarray()

    word_vectors = []
    for i in range(len(vocab)):
        word = vocab[i]
        vector = tfidf_matrix_array[i]
        str_vector = []
        for j in range(len(vector)):
            str_vector.append(str(vector[j]))
        temp = ' '.join(str_vector)
        word_vector = word + ' ' + temp
        word_vectors.append(word_vector)

    string = '\n'.join(word_vectors)
    f = open('data/corpus/' + ds + '_word_vectors.txt', 'w')
    f.write(string)
    f.close()

    return load_or_build_embedding(ds, vocab)
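# Usage sketch (an assumption, not part of the original file): read the vocab
# written by the graph-building script and call the function above. The dataset
# name '20ng' is hypothetical; `ds` only has to match the
# 'data/corpus/<ds>_word_vectors.txt' / '<ds>_vocab.txt' naming convention.
if __name__ == '__main__':
    ds = '20ng'  # hypothetical dataset name
    with open('data/corpus/' + ds + '_vocab.txt', 'r') as fh:
        vocab = fh.read().split('\n')
    word_vec_vocab, word_matrix, word_vec_id_map = load_or_build_embedding(ds, vocab)
    # word_matrix is a numpy matrix of shape (num words in the vectors file, dim)
    print(word_matrix.shape)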
import sys
import logging

from gensim.models import Word2Vec
from sklearn.cluster import KMeans

from utils import loadWord2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger('food2vec')

filename = 'food2vec.model.txt'
food2vec = loadWord2Vec(filename)
vectors = food2vec.syn0

clustersNo = 10
logger.info("Preparing clusters...")
kmeans = KMeans(n_clusters=clustersNo)
idx = kmeans.fit_predict(vectors)
logger.info("Clusters are ready!")

wordMap = dict(zip(food2vec.index2word, idx))
for cluster in range(clustersNo):
    print("\nCluster %d" % cluster)
    words = [word for word, assigned in wordMap.items() if assigned == cluster]
    print(', '.join(words))
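# Optional follow-up (a sketch, not part of the original script): rank each
# cluster's words by distance to its KMeans centroid, so the printout starts
# with the most central members. Uses only objects created above plus numpy.
import numpy as np

centroids = kmeans.cluster_centers_
for cluster in range(clustersNo):
    members = [(np.linalg.norm(vectors[i] - centroids[cluster]), word)
               for i, word in enumerate(food2vec.index2word)
               if idx[i] == cluster]
    members.sort()
    print("Cluster %d (most central first): %s"
          % (cluster, ', '.join(word for _, word in members[:10])))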
from math import log
import sys

from utils import loadWord2Vec

if len(sys.argv) != 2:
    sys.exit("Use: python build_graph.py <dataset>")

datasets = ['i2b2', 'mimic']

# build corpus
dataset = sys.argv[1]

# Read Word Vectors
model_dir = 'PATH/TO/WORD2VEC/MODEL'
word_vector_file = model_dir + 'glove.6B.100d.txt'
# word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
_, embd, word_vector_map = loadWord2Vec(word_vector_file)
word_embeddings_dim = len(embd[0])
# word_embeddings_dim = 30
# word_vector_map = {}

# shuffling
doc_name_list = []
doc_train_list = []
doc_test_list = []

f = open('data/' + dataset + '.txt', 'r')
lines = f.readlines()
for line in lines:
    doc_name_list.append(line.strip())
    temp = line.split("\t")
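# The graph-building scripts import loadWord2Vec from utils, which is not shown
# in this section. A minimal sketch consistent with how these scripts call it —
# returning (vocab, embd, word_vector_map) parsed from whitespace-separated
# "word v1 v2 ... vd" lines — might look like the following. This is an
# assumption about the helper, not its actual implementation; other files in
# this section appear to use different loadWord2Vec variants.
def loadWord2Vec_sketch(filename):
    vocab = []
    embd = []
    word_vector_map = {}
    with open(filename, 'r') as fh:
        for line in fh:
            row = line.strip().split(' ')
            if len(row) < 2:
                continue
            word = row[0]
            vector = [float(v) for v in row[1:]]
            vocab.append(word)
            embd.append(vector)
            word_vector_map[word] = vector
    return vocab, embd, word_vector_map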
import random
import pickle as pkl
from math import log

import numpy as np
import scipy.sparse as sp
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import loadWord2Vec, clean_str


def build_graph():
    dataset = 'own_wms_all'
    word_embeddings_dim = 300
    word_vector_map = {}

    # shuffling
    doc_name_list = []
    doc_train_list = []
    doc_test_list = []

    f = open('data/' + dataset + '.txt', 'r')
    lines = f.readlines()
    for line in lines:
        doc_name_list.append(line.strip())
        temp = line.split("\t")
        if temp[1].find('test') != -1:
            doc_test_list.append(line.strip())
        elif temp[1].find('train') != -1:
            doc_train_list.append(line.strip())
    f.close()
    # print(doc_train_list)
    # print(doc_test_list)

    doc_content_list = []
    f = open('data/corpus/' + dataset + '.clean.txt', 'r')
    lines = f.readlines()
    for line in lines:
        doc_content_list.append(line.strip())
    f.close()
    # print(doc_content_list)

    train_ids = []
    for train_name in doc_train_list:
        train_id = doc_name_list.index(train_name)
        train_ids.append(train_id)
    print(train_ids)
    random.shuffle(train_ids)

    # partial labeled data
    # train_ids = train_ids[:int(0.2 * len(train_ids))]

    train_ids_str = '\n'.join(str(index) for index in train_ids)
    f = open('data/' + dataset + '.train.index', 'w')
    f.write(train_ids_str)
    f.close()

    test_ids = []
    for test_name in doc_test_list:
        test_id = doc_name_list.index(test_name)
        test_ids.append(test_id)
    print(test_ids)
    random.shuffle(test_ids)

    test_ids_str = '\n'.join(str(index) for index in test_ids)
    f = open('data/' + dataset + '.test.index', 'w')
    f.write(test_ids_str)
    f.close()

    ids = train_ids + test_ids
    # print(ids)
    print(len(ids))

    shuffle_doc_name_list = []
    shuffle_doc_words_list = []
    for id in ids:
        shuffle_doc_name_list.append(doc_name_list[int(id)])
        shuffle_doc_words_list.append(doc_content_list[int(id)])
    shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
    shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

    f = open('data/' + dataset + '_shuffle.txt', 'w')
    f.write(shuffle_doc_name_str)
    f.close()

    f = open('data/corpus/' + dataset + '_shuffle.txt', 'w')
    f.write(shuffle_doc_words_str)
    f.close()

    # build vocab
    word_freq = {}
    word_set = set()
    for doc_words in shuffle_doc_words_list:
        words = doc_words.split()
        for word in words:
            word_set.add(word)
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1

    vocab = list(word_set)
    vocab_size = len(vocab)

    word_doc_list = {}
    for i in range(len(shuffle_doc_words_list)):
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        appeared = set()
        for word in words:
            if word in appeared:
                continue
            if word in word_doc_list:
                doc_list = word_doc_list[word]
                doc_list.append(i)
                word_doc_list[word] = doc_list
            else:
                word_doc_list[word] = [i]
            appeared.add(word)

    word_doc_freq = {}
    for word, doc_list in word_doc_list.items():
        word_doc_freq[word] = len(doc_list)

    word_id_map = {}
    for i in range(vocab_size):
        word_id_map[vocab[i]] = i

    vocab_str = '\n'.join(vocab)
    f = open('data/corpus/' + dataset + '_vocab.txt', 'w')
    f.write(vocab_str)
    f.close()

    definitions = []
    for word in vocab:
        word = word.strip()
        synsets = wn.synsets(clean_str(word))
        word_defs = []
        for synset in synsets:
            syn_def = synset.definition()
            word_defs.append(syn_def)
        word_des = ' '.join(word_defs)
        if word_des == '':
            word_des = '<PAD>'
        definitions.append(word_des)

    string = '\n'.join(definitions)
    f = open('data/corpus/' + dataset + '_vocab_def.txt', 'w')
    f.write(string)
    f.close()

    tfidf_vec = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vec.fit_transform(definitions)
    tfidf_matrix_array = tfidf_matrix.toarray()
    # print(tfidf_matrix_array[0], len(tfidf_matrix_array[0]))

    word_vectors = []
    for i in range(len(vocab)):
        word = vocab[i]
        vector = tfidf_matrix_array[i]
        str_vector = []
        for j in range(len(vector)):
            str_vector.append(str(vector[j]))
        temp = ' '.join(str_vector)
        word_vector = word + ' ' + temp
        word_vectors.append(word_vector)

    string = '\n'.join(word_vectors)
    f = open('data/corpus/' + dataset + '_word_vectors.txt', 'w')
    f.write(string)
    f.close()

    word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
    _, embd, word_vector_map = loadWord2Vec(word_vector_file)
    word_embeddings_dim = len(embd[0])

    # label list
    label_set = set()  # unique labels only
    for doc_meta in shuffle_doc_name_list:
        temp = doc_meta.split('\t')
        label_set.add(temp[2])
    label_list = list(label_set)

    label_list_str = '\n'.join(label_list)
    f = open('data/corpus/' + dataset + '_labels.txt', 'w')
    f.write(label_list_str)
    f.close()

    # x: feature vectors of training docs, no initial features
    # select 90% of the training set
    train_size = len(train_ids)
    val_size = int(0.1 * train_size)
    real_train_size = train_size - val_size  # - int(0.5 * train_size)
    # different training rates

    real_train_doc_names = shuffle_doc_name_list[:real_train_size]
    real_train_doc_names_str = '\n'.join(real_train_doc_names)

    f = open('data/' + dataset + '.real_train.name', 'w')
    f.write(real_train_doc_names_str)
    f.close()

    row_x = []
    col_x = []
    data_x = []
    for i in range(real_train_size):
        doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        doc_len = len(words)
        for word in words:
            if word in word_vector_map:
                word_vector = word_vector_map[word]
                # print(doc_vec)
                # print(np.array(word_vector))
                doc_vec = doc_vec + np.array(word_vector)

        for j in range(word_embeddings_dim):
            row_x.append(i)
            col_x.append(j)
            # np.random.uniform(-0.25, 0.25)
            data_x.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len

    # x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
    x = sp.csr_matrix((data_x, (row_x, col_x)),
                      shape=(real_train_size, word_embeddings_dim))

    y = []
    for i in range(real_train_size):
        doc_meta = shuffle_doc_name_list[i]
        temp = doc_meta.split('\t')
        temp2 = temp[2].split(',')
        one_hot = []
        for j in range(len(temp2)):
            if temp2[j] == '0':
                one_hot.append(0)
            elif temp2[j] == '1':
                one_hot.append(1)
        y.append(one_hot)
    y = np.array(y)
    print(y)

    # tx: feature vectors of test docs, no initial features
    test_size = len(test_ids)

    row_tx = []
    col_tx = []
    data_tx = []
    for i in range(test_size):
        doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
        doc_words = shuffle_doc_words_list[i + train_size]
        words = doc_words.split()
        doc_len = len(words)
        for word in words:
            if word in word_vector_map:
                word_vector = word_vector_map[word]
                doc_vec = doc_vec + np.array(word_vector)

        for j in range(word_embeddings_dim):
            row_tx.append(i)
            col_tx.append(j)
            # np.random.uniform(-0.25, 0.25)
            data_tx.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len

    # tx = sp.csr_matrix((test_size, word_embeddings_dim), dtype=np.float32)
    tx = sp.csr_matrix((data_tx, (row_tx, col_tx)),
                       shape=(test_size, word_embeddings_dim))

    ty = []
    for i in range(test_size):
        doc_meta = shuffle_doc_name_list[i + train_size]
        temp = doc_meta.split('\t')
        temp2 = temp[2].split(',')
        one_hot = []
        for j in range(len(temp2)):
            if temp2[j] == '0':
                one_hot.append(0)
            elif temp2[j] == '1':
                one_hot.append(1)
        # for i in range(len(temp)):
        #     if i > 1:
        #         if temp[i] == '0':
        #             one_hot.append(0)
        #         elif temp[i] == '1':
        #             one_hot.append(1)
        ty.append(one_hot)
    ty = np.array(ty)
    print(ty)

    # allx: the feature vectors of both labeled and unlabeled training instances
    # (a superset of x)
    # unlabeled training instances -> words
    word_vectors = np.random.uniform(-0.01, 0.01,
                                     (vocab_size, word_embeddings_dim))
    # vocab_size = len(vocab)
    # word_embeddings_dim = len(embd[0])

    for i in range(len(vocab)):
        word = vocab[i]
        if word in word_vector_map:
            vector = word_vector_map[word]
            word_vectors[i] = vector

    row_allx = []
    col_allx = []
    data_allx = []

    for i in range(train_size):
        doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        doc_len = len(words)
        for word in words:
            if word in word_vector_map:
                word_vector = word_vector_map[word]
                doc_vec = doc_vec + np.array(word_vector)

        for j in range(word_embeddings_dim):
            row_allx.append(int(i))
            col_allx.append(j)
            # np.random.uniform(-0.25, 0.25)
            data_allx.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len

    for i in range(vocab_size):
        for j in range(word_embeddings_dim):
            row_allx.append(int(i + train_size))
            col_allx.append(j)
            data_allx.append(word_vectors.item((i, j)))

    row_allx = np.array(row_allx)
    col_allx = np.array(col_allx)
    data_allx = np.array(data_allx)

    allx = sp.csr_matrix((data_allx, (row_allx, col_allx)),
                         shape=(train_size + vocab_size, word_embeddings_dim))

    ally = []
    for i in range(train_size):
        doc_meta = shuffle_doc_name_list[i]
        temp = doc_meta.split('\t')
        temp2 = temp[2].split(',')
        one_hot = []
        for j in range(len(temp2)):
            if temp2[j] == '0':
                one_hot.append(0)
            elif temp2[j] == '1':
                one_hot.append(1)
        ally.append(one_hot)

    for i in range(vocab_size):
        # NOTE: `classes` (the list of label names) is assumed to be defined
        # elsewhere in this module; word nodes get an all-zero label vector.
        one_hot = [0 for l in range(len(classes))]
        ally.append(one_hot)

    ally = np.array(ally)
    print(ally)

    print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)

    '''
    Doc word heterogeneous graph
    '''

    # word co-occurrence with context windows
    window_size = 20
    windows = []

    for doc_words in shuffle_doc_words_list:
        words = doc_words.split()
        length = len(words)
        if length <= window_size:
            windows.append(words)
        else:
            # print(length, length - window_size + 1)
            for j in range(length - window_size + 1):
                window = words[j:j + window_size]
                windows.append(window)
                # print(window)

    word_window_freq = {}
    for window in windows:
        appeared = set()
        for i in range(len(window)):
            if window[i] in appeared:
                continue
            if window[i] in word_window_freq:
                word_window_freq[window[i]] += 1
            else:
                word_window_freq[window[i]] = 1
            appeared.add(window[i])

    word_pair_count = {}
    for window in windows:
        for i in range(1, len(window)):
            for j in range(0, i):
                word_i = window[i]
                word_i_id = word_id_map[word_i]
                word_j = window[j]
                word_j_id = word_id_map[word_j]
                if word_i_id == word_j_id:
                    continue
                word_pair_str = str(word_i_id) + ',' + str(word_j_id)
                if word_pair_str in word_pair_count:
                    word_pair_count[word_pair_str] += 1
                else:
                    word_pair_count[word_pair_str] = 1
                # two orders
                word_pair_str = str(word_j_id) + ',' + str(word_i_id)
                if word_pair_str in word_pair_count:
                    word_pair_count[word_pair_str] += 1
                else:
                    word_pair_count[word_pair_str] = 1

    row = []
    col = []
    weight = []

    # pmi as weights
    num_window = len(windows)

    for key in word_pair_count:
        temp = key.split(',')
        i = int(temp[0])
        j = int(temp[1])
        count = word_pair_count[key]
        word_freq_i = word_window_freq[vocab[i]]
        word_freq_j = word_window_freq[vocab[j]]
        pmi = log((1.0 * count / num_window) /
                  (1.0 * word_freq_i * word_freq_j / (num_window * num_window)))
        if pmi <= 0:
            continue
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)

    # word vector cosine similarity as weights
    '''
    for i in range(vocab_size):
        for j in range(vocab_size):
            if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
                vector_i = np.array(word_vector_map[vocab[i]])
                vector_j = np.array(word_vector_map[vocab[j]])
                similarity = 1.0 - cosine(vector_i, vector_j)
                if similarity > 0.9:
                    print(vocab[i], vocab[j], similarity)
                    row.append(train_size + i)
                    col.append(train_size + j)
                    weight.append(similarity)
    '''

    # doc word frequency
    doc_word_freq = {}
    for doc_id in range(len(shuffle_doc_words_list)):
        doc_words = shuffle_doc_words_list[doc_id]
        words = doc_words.split()
        for word in words:
            word_id = word_id_map[word]
            doc_word_str = str(doc_id) + ',' + str(word_id)
            if doc_word_str in doc_word_freq:
                doc_word_freq[doc_word_str] += 1
            else:
                doc_word_freq[doc_word_str] = 1

    for i in range(len(shuffle_doc_words_list)):
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        doc_word_set = set()
        for word in words:
            if word in doc_word_set:
                continue
            j = word_id_map[word]
            key = str(i) + ',' + str(j)
            freq = doc_word_freq[key]
            if i < train_size:
                row.append(i)
            else:
                row.append(i + vocab_size)
            col.append(train_size + j)
            idf = log(1.0 * len(shuffle_doc_words_list) /
                      word_doc_freq[vocab[j]])
            # NOTE: `spi` (a per-lemma weight dictionary) and `lemma` are
            # assumed to be defined elsewhere in this module.
            weight.append(freq * idf * spi[lemma(word)])
            doc_word_set.add(word)

    node_size = train_size + vocab_size + test_size
    adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

    # dump objects
    f = open("data/ind.{}.x".format(dataset), 'wb')
    pkl.dump(x, f)
    f.close()

    f = open("data/ind.{}.y".format(dataset), 'wb')
    pkl.dump(y, f)
    f.close()

    f = open("data/ind.{}.tx".format(dataset), 'wb')
    pkl.dump(tx, f)
    f.close()

    f = open("data/ind.{}.ty".format(dataset), 'wb')
    pkl.dump(ty, f)
    f.close()

    f = open("data/ind.{}.allx".format(dataset), 'wb')
    pkl.dump(allx, f)
    f.close()

    f = open("data/ind.{}.ally".format(dataset), 'wb')
    pkl.dump(ally, f)
    f.close()

    f = open("data/ind.{}.adj".format(dataset), 'wb')
    pkl.dump(adj, f)
    f.close()
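# Reading the dumped objects back (a sketch; the loader name is illustrative,
# and 'own_wms_all' mirrors the dataset name hard-coded in build_graph above).
import pickle as pkl

def load_graph_objects(dataset='own_wms_all'):
    """Load x, y, tx, ty, allx, ally and the adjacency matrix written by build_graph()."""
    objects = []
    for name in ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'adj']:
        with open("data/ind.{}.{}".format(dataset, name), 'rb') as fh:
            objects.append(pkl.load(fh))
    return tuple(objects)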
        jsonstr = ''.join(f.readlines())
        ent_dic = json.loads(jsonstr)
else:
    cand_dic, ent_dic = GenerateCand('kb.json')

# Generate the train / dev / test text data
if not os.path.exists('../data/generated/train_data.txt'):
    GeneratePairwaiseSample('train.json', cand_dic, ent_dic, is_train=True)
if not os.path.exists('../data/generated/dev_data.txt'):
    GeneratePairwaiseSample('dev.json', cand_dic, ent_dic, is_train=False)
if not os.path.exists('../data/generated/test_data.txt'):
    GeneratePairwaiseSample('test.json', cand_dic, ent_dic, is_train=False)

# matrix: array of word vectors; vocab contains vocab["w2i"] (word-to-index)
# and vocab["i2w"] (index-to-word); also returns the vector dimension and
# vocabulary size
if not os.path.exists('../data/pretrain_data/matrix.npy'):
    matrix, vocab, vec_dim, vocab_size = utils.loadWord2Vec(
        "../data/pretrain_data/word2vec.iter5")
else:
    matrix = np.load('../data/pretrain_data/matrix.npy')
    with open('../data/pretrain_data/vocab.json', 'r', encoding='utf8') as f:
        jsonstr = ''.join(f.readlines())
        vocab = json.loads(jsonstr)

# type-to-label dictionary
type2label = utils.type2label

# data encoding
data_encoder = DataEncoder(vocab["w2i"], type2label)
if not os.path.exists('../data/generated/train.csv'):
    data_encoder.data_encode("../data/generated/train_data.txt", is_train=True)
if not os.path.exists('../data/generated/dev.csv'):
    data_encoder.data_encode("../data/generated/dev_data.txt", is_train=False)
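# The cached paths above ('matrix.npy', 'vocab.json') suggest the freshly built
# matrix and vocab are saved for later runs, but that step is not shown here.
# A minimal sketch of such caching, assuming `matrix` is a NumPy array and
# `vocab` is a plain dict (function name is illustrative):
import json
import numpy as np

def cache_pretrain_data(matrix, vocab,
                        matrix_path='../data/pretrain_data/matrix.npy',
                        vocab_path='../data/pretrain_data/vocab.json'):
    """Save the embedding matrix and vocab so later runs can skip rebuilding."""
    np.save(matrix_path, np.asarray(matrix))
    with open(vocab_path, 'w', encoding='utf8') as f:
        json.dump(vocab, f, ensure_ascii=False)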
import os
import sys
import string
from warnings import warn
from itertools import cycle
from itertools import repeat

import numpy as np

# loadPickle and loadWord2Vec are assumed to be defined elsewhere in this module.

# Try to load the word2vec model and the MultiLabelBinarizer
w2vfile = './models/w2v'
mlbfile = './models/mlb.pickle'

w2v = False
# Loading pickle files is faster, so check that one first
if os.path.exists(w2vfile + '.pickle'):
    w2v = loadPickle(w2vfile + '.pickle')
elif os.path.exists(w2vfile + '.bin'):
    w2v = loadWord2Vec(w2vfile + '.bin')
else:
    warn("{} not found, will not be able to sub or create word matrices".format(
        w2vfile))

if w2v:
    word_d = w2v.layer1_size

prepare_mode = ('-p' in sys.argv or '--prepare' in sys.argv
                or '-m' in sys.argv or '--make' in sys.argv)

if os.path.exists(mlbfile) and not prepare_mode:
    mlb = loadPickle(mlbfile)
    valid_hashtags = set(mlb.classes_)
else:
    valid_hashtags = set()
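# loadPickle is not shown in this snippet; a minimal sketch of what it is
# assumed to do (the name matches the call sites above, the body is a guess):
import pickle

def loadPickle(path):
    """Load and return any pickled object from `path`."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# For the '.bin' branch, loadWord2Vec presumably wraps a binary word2vec
# loader; with gensim this could be KeyedVectors.load_word2vec_format(path,
# binary=True). Note that `w2v.layer1_size` above follows an older gensim API;
# recent gensim exposes the dimensionality as `vector_size` instead.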
import sys
import logging

import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

from utils import loadWord2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger('food2vec')

filename = 'food2vec.model.txt'
food2vec = loadWord2Vec(filename)
# Keep the labels in model order so labels[i] matches vectors[i];
# wrapping them in a set would lose that alignment.
labels = food2vec.index2word
vectors = food2vec.syn0

logger.info("Preparing tsne transformation...")
tsne = TSNE(perplexity=15, n_components=2, init='pca', n_iter=4000,
            early_exaggeration=8.0)
vectors2d = tsne.fit_transform(vectors)

logger.info('Trying to plot food2vec results...')
plt.figure(figsize=(15, 15))
for i, label in enumerate(labels):
    x, y = vectors2d[i, :]
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.savefig('tsne.png')
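# Optional variation (an assumption, not part of the original script):
# annotating every vocabulary word quickly becomes unreadable, so project and
# plot only the first max_words entries. Reuses labels, vectors, TSNE and plt
# from the script above.
max_words = 200
subset_labels = labels[:max_words]
subset_vectors = vectors[:max_words]
subset2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=4000,
                early_exaggeration=8.0).fit_transform(subset_vectors)
plt.figure(figsize=(15, 15))
for i, label in enumerate(subset_labels):
    x, y = subset2d[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.savefig('tsne_subset.png')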