def main(sentences, wordfile: str, weightfile: str, weightpara: float = 1e-3, rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters (a local name avoids shadowing the imported params module)
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
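# A minimal call sketch for main() above. The word-vector and word-frequency file paths
# are placeholders (assumptions), and data_io / params / SIF_embedding are assumed to be
# importable from the SIF repository, as in the function itself.
example_sentences = ['this is an example sentence',
                     'this is another sentence that is slightly longer']
example_emb = main(example_sentences,
                   wordfile='glove.840B.300d.txt',        # assumed path to GloVe-format vectors
                   weightfile='enwiki_vocab_min200.txt',  # assumed path to a word-frequency file
                   weightpara=1e-3,
                   rmpc=1)
print(example_emb.shape)  # (number of sentences, embedding dimension)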
def fit(self, sentences, We, lowercase_tokens, embeddings_format,
        embeddings_filepath, params, word_map, weight4ind):
    # store these off for pickling or extra transforms
    self.word_map = word_map
    self.weight4ind = weight4ind
    self.params = params
    self.lowercase_tokens = lowercase_tokens
    self.embeddings_format = embeddings_format
    self.embeddings_filepath = embeddings_filepath
    self.sentence_count = len(sentences)

    x, m = data_io.sentences2idx(sentences, self.word_map)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, self.weight4ind)  # get word weights

    # now let's do some of what happens in src/SIF_embedding.py
    # but also keep some pieces along the way
    # weighted_emb = get_weighted_average(We, x, w)
    weighted_emb = get_weighted_average_alternate(We, x, w)
    self.compute_pc(weighted_emb)
    self.trained = True
    return self.remove_pc(weighted_emb)
def compute_sif_emb(self, sentences):
    # load sentences
    x1, m = data_io.sentences2idx(sentences, self.words)
    w1 = data_io.seq2weight(x1, m, self.weight4ind)
    result = get_emb(self.We, x1, w1)
    return result
def getSIFscore(sentences: list, words, weight4ind, rmpc, We, params, sx: int, sy: int):
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('load sentences finished')

    # set parameters (a local name avoids reassigning the params argument, which is the module here)
    sif_params = params.params()
    sif_params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    embeddingSize = len(embedding)
    # print('embeddingSize= ', embeddingSize)
    emb = [embedding[i, :] for i in range(embeddingSize)]

    # cosine similarity between sentence sx and sentence sy
    emb1 = emb[sx]
    emb2 = emb[sy]
    inn = (emb1 * emb2).sum()
    emb1norm = numpy.sqrt((emb1 * emb1).sum())
    emb2norm = numpy.sqrt((emb2 * emb2).sum())
    score = inn / emb1norm / emb2norm
    # print(sentences[sx], '--------', sentences[sy], ' = ', score, '\n')
    return score
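# The inner product and norms in getSIFscore() implement plain cosine similarity.
# A small helper like this (hypothetical, not part of the original snippet) keeps that
# step reusable across the callers in this file; it assumes 1-D numpy vectors.
import numpy as np

def cosine_similarity(u, v):
    """Cosine similarity between two 1-D embedding vectors."""
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))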
def return_sif(sentences, words, weight4ind, param, Weights):
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding
    embeddings = SIF_embedding.SIF_embedding(Weights, x, w, param)  # embeddings[i,:] is the embedding for sentence i
    return embeddings
def get_embedding(sentence, words, weight4ind, params, We):
    # load sentences
    xx, mm = data_io.sentences2idx(sentence, words)
    ww = data_io.seq2weight(xx, mm, weight4ind)  # get word weights
    # get SIF embedding
    em = SIF_embedding.SIF_embedding(We, xx, ww, params)  # em[i,:] is the embedding for sentence i
    return em
def get_embs(sentences, params):
    # wordfile, weightfile and weightpara are expected to be module-level globals here
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def generate_vecs(models, document):
    words, weight4ind, rmpc, We = models
    x, m = data_io.sentences2idx(document, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
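# How the `models` tuple is assembled is not shown in generate_vecs(); a plausible
# assembly, assuming the same data_io helpers and placeholder file paths, is:
words, We = data_io.getWordmap('glove.840B.300d.txt')                 # assumed path
word2weight = data_io.getWordWeight('enwiki_vocab_min200.txt', 1e-3)  # assumed path
weight4ind = data_io.getWeight(words, word2weight)
models = (words, weight4ind, 1, We)  # rmpc = 1
vecs = generate_vecs(models, ['an example sentence', 'another example sentence'])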
def SIFSentEmbedding(weighttxt, docfile, words, We, weight4ind, weightpara=1e-3, paramm=1):
    # weightpara: the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # paramm: number of principal components to remove in the SIF weighting scheme
    # (weighttxt and weightpara are accepted but not used in this body)
    sentences = sent_tokenize(docfile)
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters; the original overwrote the params object with paramm.LC, which
    # passed a bare float into SIF_embedding -- build a proper params object instead
    sif_params = params.params()
    sif_params.rmpc = paramm
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def sif_embedding(sen):
    import sys
    # sys.path.append("../src")
    # sys.path.append("../data")
    import data_io, params, SIF_embedding

    # input
    wordfile = 'data/dic_files.txt'   # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1           # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # the words and their word vectors

    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words)
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x, m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:', w)

    # set parameters; use a local name so the imported params module is not rebound
    # params = params.params()
    sif_params = params.params_all()
    sif_params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def get_sent_vec(sentences):
    import params
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # set parameters
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # call the SIF core algorithm to compute sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, sif_params)
    sent_vec = {}
    for i in range(len(embedding)):
        sent_vec[sentences[i]] = embedding[i]
    return sent_vec
def sentences2embeddings(sentences):
    """
    Input:  sentences - a list of sentences
    Output: sentence_embeddings - a list of sentence embeddings (numpy vectors of shape (1, 300))
    """
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters
    parameters = params.params()
    parameters.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, parameters)  # embedding[i,:] is the embedding for sentence i
    sentence_embeddings = []
    for i in range(len(sentences)):
        e = np.array(embedding[i, :]).reshape((1, 300))  # reshape to fit into the function semantic_similarity
        sentence_embeddings.append(e)
    return sentence_embeddings
def get_sent_vec(sentences):
    '''
    Compute sentence vectors with the SIF algorithm.
    :param sentences: a list of sentences (each sentence is a plain string; no jieba.cut tokenization is needed)
    :return: a dict mapping each sentence string to its sentence vector
    '''
    import params
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # set parameters
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # call the SIF core algorithm to compute sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, sif_params)
    sent_vec = {}
    for i in range(len(embedding)):
        sent_vec[sentences[i]] = embedding[i]
    return sent_vec
# if '20-30' in sentence_file or '25-35' in sentence_file or '30-40' \
#         in sentence_file or '5-15' in sentence_file:
batch_num = 0
with open(sentence_file, 'r', encoding='utf-8') as fr:
    print('Processing file', sentence_file, '...')
    p = hnswlib.Index(space='cosine', dim=dimension)
    p.init_index(max_elements=num_elements, ef_construction=2000, M=80)
    p.set_ef(1000)
    # Set number of threads used during batch search/construction
    # By default using all available cores
    p.set_num_threads(30)
    for n_lines in iter(lambda: tuple(islice(fr, batch_size)), ()):
        sents = list(map(str.strip, n_lines))
        sent_id = list(map(lambda x: int(x.split('\t')[0]), sents))
        sentences = list(map(lambda x: x.split('\t')[-1], sents))
        x, m = data_io.sentences2idx(sentences, words)
        w = data_io.seq2weight(x, m, weight4ind)
        # get SIF embedding
        embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
        embeddings.normalize(embedding, ["unit", "center"])
        p.add_items(embedding, sent_id)
        print('Finished batch', batch_num, '.', end='\r')
        batch_num += 1
    print('\nFinished loading', sentence_file, '.')
    out_file = sentence_file + '.ann'
    p.save_index(out_file)
    print('Finished saving', out_file, '.')
    del p
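# A hypothetical follow-up to the indexing step above: load the saved index and query it
# with a SIF embedding produced by the same pipeline (sentences2idx -> seq2weight ->
# SIF_embedding). The query variable names are assumptions, not part of the original script.
q = hnswlib.Index(space='cosine', dim=dimension)
q.load_index(out_file, max_elements=num_elements)
q.set_ef(1000)
labels, distances = q.knn_query(query_embedding, k=10)  # query_embedding: shape (n_queries, dimension)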
def top_filtering(logits, words, weight4ind, We, tokenizer, history, args, params,
                  embedding1, top_k=0, top_p=0.0, threshold=-float('Inf'),
                  filter_value=-float('Inf'), current_output=None):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
    """
    if current_output is None:
        current_output = []
    assert logits.dim() == 1  # Only work for batch size 1 for now - could update but it would obfuscate a bit the code
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Remove all tokens with a probability less than the last token in the top-k tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Compute cumulative probabilities of sorted tokens
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probabilities > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        # Back to unsorted indices and set them to -infinity
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        indices_to_use = sorted_indices[~sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        if len(indices_to_use) > 1:
            cands = [current_output + [idx] for idx in indices_to_use.tolist()]
            raw_cands = [tokenizer.decode(cand, skip_special_tokens=True) for cand in cands]
            scores = []
            for i in raw_cands:
                sentences = [i]
                x, m = data_io.sentences2idx(sentences, words)
                w = data_io.seq2weight(x, m, weight4ind)
                embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
                inn = (embedding1 * embedding2).sum(axis=1)
                emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
                emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
                scores.append(inn / emb1norm / emb2norm)
                # print(sentences)
            # boost each candidate token's logit by its SIF cosine similarity to the reference embedding
            for idx, sim in zip(indices_to_use, scores):
                logits[idx] += sim.item()
        """
        probs = F.softmax(logits, dim=-1)
        index = []
        for i in probs:
            if i > 0:
                index.append(i)
        prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, len(index))
        text = []
        last_utt = history[-1]
        last = tokenizer.decode(last_utt, skip_special_tokens=True)
        sentences = [last]
        # load sentences
        x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
        w = data_io.seq2weight(x, m, weight4ind)  # get word weights
        # set parameters
        global params
        params.rmpc = rmpc
        # get SIF embedding
        embedding1 = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
        for i in prev:
            text.append(i.item())
        for i in text:
            cand = current_output.copy()
            cand.append(i)
            indice = i
            raw_text = tokenizer.decode(cand, skip_special_tokens=True)
            sentences = [raw_text]
            x, m = data_io.sentences2idx(sentences, words)
            w = data_io.seq2weight(x, m, weight4ind)
            embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
            inn = (embedding1 * embedding2).sum(axis=1)
            emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
            emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
            scores = inn / emb1norm / emb2norm
            # print(scores)
            logits[indice] += scores.item()
            cand.clear()
        """

    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value
    return logits
def sample_sequence(personality, history, tokenizer, model, args, words, weight4ind, We, current_output=None):
    special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    if current_output is None:
        current_output = []

    last_utt = history[-1]
    last = tokenizer.decode(last_utt, skip_special_tokens=True)
    sentences = [last]
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # set parameters
    global params
    params.rmpc = rmpc
    # get SIF embedding of the last utterance in the history
    embedding1 = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding1[i,:] is the embedding for sentence i

    for i in range(args.max_length):
        instance, _ = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False)
        input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0)
        temperature = 1.0
        top_k = 0
        top_p = 0.9

        logits = model(input_ids, token_type_ids=token_type_ids)
        if isinstance(logits, tuple):  # for gpt2 and maybe others
            logits = logits[0]
        logits = logits[0, -1, :] / args.temperature
        logits = top_filtering(logits, words, weight4ind, We, tokenizer, history, args, params,
                               embedding1, top_k=top_k, top_p=top_p, current_output=current_output)
        probs = F.softmax(logits, dim=-1)

        prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
        if i < args.min_length and prev.item() in special_tokens_ids:
            while prev.item() in special_tokens_ids:
                if probs.max().item() == 1:
                    warnings.warn("Warning: model generating special token with probability 1.")
                    break  # avoid infinitely looping over special token
                prev = torch.multinomial(probs, num_samples=1)

        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())

    return current_output
model_100 = Word2Vec.load(os.path.join('/media/brx/TOSHIBA EXT/wiki_zh_word2vec/', 'ngram_100_5_90w.bin'))
words = {}
for index, word in enumerate(model_100.wv.index2entity):
    words[word] = index
We = model_100.wv.vectors

# the reference pipeline below was kept by the author as a commented-out block
'''
# input
wordfile = '../newsif/datafile/without_stopwords/word2vec_format.txt'  # word vector file, can be downloaded from GloVe website
weightfile = '../newsif/datafile/without_stopwords/words_count.txt'    # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1           # number of principal components to remove in SIF weighting scheme

sentences_test = ['会议充分肯定了2019年金融市场和信贷政策工作取得的成绩,在推动金融市场规范、创新、发展、开放,加大金融支持国家战略和重点领域。',
                  '民营小微企业、精准扶贫力度,稳妥开展互联网金融风险专项整治以及房地产金融宏观审慎管理等方面做了大量卓有成效的工作。',
                  '为实施稳健货币政策、防范化解重大金融风险、推动经济结构调整和转型升级提供了有力支撑。']

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(sentences_test, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_core.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
print(embedding)
'''
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataname', default='t6', help='dataset name', choices=['t6', 't26', '2C'])
    parser.add_argument('-c', '--classifiername', default='RF', help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname       # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    # Original SIF paper used glove.840B.300d; we use the vectors that were trained on twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimensions if needed
    wordfile_list = ['../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = ["sandy", "queensland", "boston", "west_texas", "oklahoma", "alberta"]
            test_list = ["{}_glove_token.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}_training.csv".format(disaster) for disaster in disasters]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = ["2012_Colorado_wildfires", "2013_Queensland_floods", "2013_Boston_bombings",
                         "2013_West_Texas_explosion", "2013_Alberta_floods", "2013_Colorado_floods",
                         "2013_NY_train_crash"]
            test_list = ["{}-tweets_labeled.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}_training.csv".format(disaster) for disaster in disasters]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = ["Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco", "Boston",
                         "Brisbane", "Dublin", "London", "Sydney"]
            test_list = ["{}2C.csv.token.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}2C_training.csv".format(disaster) for disaster in disasters]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train, weight4ind)  # get word weights
            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(We, xtrain_windx, w_train, paramss)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)
            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(We, xtest_windx, w_test, paramsss)  # embedding[i,:] is the embedding for sentence i

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name, clf_name + str(dim), accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim), roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name, clf_name + str(dim), precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim), recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim), f1_list))
        print("{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
              .format(data_name, clf_name + str(dim),
                      np.mean(accu_list), np.std(accu_list),
                      np.mean(roc_list), np.std(roc_list),
                      np.mean(f1_list), np.std(f1_list),
                      np.mean(precision_list), np.std(precision_list),
                      np.mean(recall_list), np.std(recall_list)))
for index, word in enumerate(model_300.wv.index2entity):
    words[word] = index
We = model_300.wv.vectors
weightpara = 1e-3
rmpc = 1
sentences = ['this is an example sentence',
             'this is another sentence that is slightly longer']

# load word vectors
# (words, We) = data_io.getWordmap(wordfile)
# load word weights
# word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
word2weight = data_io.getWordWeight(model_300.wv.vocab, weightpara)
weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

# load sentences
x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc

# get SIF embedding
embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
def transform(self, We, sentences):
    x, m = data_io.sentences2idx(sentences, self.word_map)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, self.weight4ind)  # get word weights
    weighted_emb = get_weighted_average(We, x, w)
    # now use the model we've already loaded
    return self.remove_pc(weighted_emb)
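# A hedged driver for the fit()/transform() pair above. The class name SIFModel and the
# file paths are assumptions; only the call pattern is taken from the two methods.
words, We = data_io.getWordmap('glove.840B.300d.txt')                 # assumed path
word2weight = data_io.getWordWeight('enwiki_vocab_min200.txt', 1e-3)  # assumed path
weight4ind = data_io.getWeight(words, word2weight)

sif_model = SIFModel()  # hypothetical class exposing the fit() and transform() methods
train_emb = sif_model.fit(['a training sentence', 'another training sentence'], We,
                          lowercase_tokens=True,
                          embeddings_format='glove',
                          embeddings_filepath='glove.840B.300d.txt',
                          params=params.params(),
                          word_map=words,
                          weight4ind=weight4ind)
test_emb = sif_model.transform(We, ['an unseen test sentence'])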
def get_embedding(self, sentences, language='Chinese', weightpara=1e-3):
    """
    Return the embeddings for all sentences in the input.
    parameter: sentences
        sentences is a list of sentences that need SIF embeddings
    """
    if language == 'Chinese':
        # word vector file
        # For model2:
        # wordfile =
        # For model1:
        # wordfile = '../models/wiki_news_word_vector_small2.txt'
        # word frequency file
        # weightfile =
        words = self.words_chi
        word2weight = self.word2weight_chi
        We = self.We_chi
    else:
        # for English use:
        # wordfile =
        # wordfile = '../models/glove.840B.300d.txt'
        # weightfile =
        # weightpara = 1e-5
        # weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
        words = self.words_eng
        word2weight = self.word2weight_eng
        We = self.We_eng

    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    weight4ind = data_io.getWeight(words, word2weight)
    print('weight4ind finished')

    # load sentences
    if language == 'Chinese':
        x, m = data_io.sentences2idx_c(sentences, words)
    else:
        x, m = data_io.sentences2idx(sentences, words)
    # print(x.shape)  # (number of sentences, number of words in the longest sentence)
    # print(m.shape)  # (number of sentences, number of words in the longest sentence)
    print('sentences2idx finished')

    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    print('seq2weight finished')

    # set parameters
    param = params.params()
    param.rmpc = rmpc

    # get SIF embedding
    # Returns the embeddings of every title, full text and sentence whose similarity we need to compute.
    # The paper uses TruncatedSVD; the project asks us to use PCA for the decomposition instead.
    print('embedding start')
    embedding = SIF_embedding.SIF_embedding(We, x, w, param, method='PCA')  # embedding[i,:] is the embedding for sentence i
    print('embedding finished')
    print(embedding.shape)
    return embedding
# srcsent = ['Pada mulanya, waktu Allah mulai menciptakan alam semesta']
# tgtsent = ['God saw the light, and saw that it was good. God divided the light from the darkness.']
# params = params.params()
weightpara = 1e-3
rmpc = 1

# def srcEmbedding(srcWordFilePath, srcsent):
src_model_300 = gensim.models.KeyedVectors.load_word2vec_format(srcWordFilePath, binary=False)
srcwords = {}
for index, word in enumerate(src_model_300.wv.index2entity):
    srcwords[word] = index
srcWe = src_model_300.wv.vectors
srcword2weight = data_io.getWordWeight(src_model_300.wv.vocab, weightpara)
srcweight4ind = data_io.getWeight(srcwords, srcword2weight)
srcx, srcm = data_io.sentences2idx(srcsent, srcwords)
srcw = data_io.seq2weight(srcx, srcm, srcweight4ind)
srcparams = params.params()
srcparams.rmpc = rmpc
srcEmbedding = SIF_embedding.SIF_embedding(srcWe, srcx, srcw, srcparams)
# return embedding

# def tgtEmbedding(tgtWordFilePath, tgtsent):
tgtmodel_300 = gensim.models.KeyedVectors.load_word2vec_format(tgtWordFilePath, binary=False)
tgtwords = {}
for index, word in enumerate(tgtmodel_300.wv.index2entity):
    tgtwords[word] = index
tgtWe = tgtmodel_300.wv.vectors
tgtword2weight = data_io.getWordWeight(tgtmodel_300.wv.vocab, weightpara)
tgtweight4ind = data_io.getWeight(tgtwords, tgtword2weight)
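# The source and target branches above repeat the same recipe. A consolidated sketch
# (the helper name is hypothetical; it mirrors the gensim attributes the script already uses):
def keyedvectors_sif_inputs(model, weightpara=1e-3):
    # build the word->index map, the embedding matrix, and the per-index SIF weights
    words = {word: index for index, word in enumerate(model.wv.index2entity)}
    We = model.wv.vectors
    word2weight = data_io.getWordWeight(model.wv.vocab, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    return words, We, weight4ind

# e.g. srcwords, srcWe, srcweight4ind = keyedvectors_sif_inputs(src_model_300)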
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")
# x is the array of word indices, m is the binary mask indicating whether there is a word in that location
x, m = data_io.sentences2idx(sample_sents, glove_words)
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
sample_embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # sample_embedding[i,:] is the embedding for sentence i

print("shape of sample sentence embedding is: " + str(sample_embedding.shape))
# serialize for future use
numpy.save('sample_sentence_embedding.npy', sample_embedding)
# (these first statements are the tail of a sentence-loading loop; the enclosing
#  `for line in ...:` and file handle are not shown in this excerpt)
line = line.strip()
line = line.split(':')[1]
sentences.append(line)

glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'  # word vector file, can be downloaded from GloVe website
word_frequency_path = './douban_cropus_vocab.txt'  # each line is a word and its frequency
weightpara = 1e-3
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)
# load word weights
word2weight = data_io.getWordWeight(word_frequency_path, weightpara)  # word2weight['str'] is the weight for the word 'str'
Index2Weight = data_io.getWeight(Word2Indx, word2weight)  # Index2Weight[i] is the weight for the i-th word

# word_idx_seq_of_sentence is the array of word indices, mask is the binary mask indicating whether there is a word in that location
word_idx_seq_of_sentence, mask = data_io.sentences2idx(sentences, Word2Indx)
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask, Index2Weight)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc

embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, params)
np.save("douban_sentence2vector.npy", embedding)