def main(sentences, wordfile: str, weightfile: str, weightpara: float = 1e-3, rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters (use a local name so the `params` module is not shadowed)
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
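# A minimal usage sketch for main() above -- an assumption, not part of the original file.
# It presumes the SIF repo's data_io, params and SIF_embedding modules are importable and that
# the GloVe vectors and enwiki frequency file have been downloaded; the paths are placeholders.
example_sentences = ['this is an example sentence',
                     'this is another sentence that is slightly longer']
emb = main(example_sentences,
           wordfile='../data/glove.840B.300d.txt',
           weightfile='../auxiliary_data/enwiki_vocab_min200.txt',
           weightpara=1e-3,
           rmpc=1)
print(emb.shape)  # (number of sentences, embedding dimension)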
def SIFDocEmbedding(w2vdict, weighttxt, txtfile):
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    (words, We) = getWordMap(w2vdict)
    word2weight = data_io.getWordWeight(weighttxt, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    DocVectorDict = {}
    DocSentVecDict = {}
    docNum = 0
    with open(txtfile, 'r') as reader:
        txt = reader.readlines()
    for doc in txt:
        doc = doc.strip()
        sentEm = SIFSentEmbedding(weighttxt, doc, words, We, weight4ind, weightpara=1e-3, paramm=1)
        DocSentVecDict[docNum] = sentEm
        # document vector = average of the sentence vectors (sentEm is num_sentences x dim)
        docVector = np.sum(sentEm, axis=0) / sentEm.shape[0]
        DocVectorDict[docNum] = docVector
        docNum += 1
    return DocVectorDict, DocSentVecDict, We
def load_embeddings(wordfile, weightfile, weightpara=5e-4, word2vec=False):
    if word2vec:
        (words, We) = getWordmapWord2Vec(wordfile)
    else:
        (words, We) = data_io.getWordmap(wordfile)
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    return words, We, weight4ind
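# A minimal usage sketch for load_embeddings() -- an assumption, not part of the original file.
# The idea is to load vectors and SIF weights once and reuse them across many batches of
# sentences; the paths below are placeholders, and getWordmapWord2Vec is only needed when
# word2vec=True.
words, We, weight4ind = load_embeddings(
    wordfile='../data/glove.6B.300d.txt',
    weightfile='../auxiliary_data/enwiki_vocab_min200.txt',
    weightpara=5e-4,
    word2vec=False)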
def get_embs(sentences, params):
    # load word vectors (wordfile, weightfile and weightpara are module-level settings)
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def __init__(self):
    self.weightfile = config.url_enwiki
    self.weightpara = 1e-3
    print("Getting embeddings from the Glove pickles")
    with open(config.url_glove_pickle_we_1, "rb") as file:
        We_1 = pickle.load(file)
    with open(config.url_glove_pickle_words_1, "rb") as file:
        words_1 = pickle.load(file)
    with open(config.url_glove_pickle_we_2, "rb") as file:
        We_2 = pickle.load(file)
    with open(config.url_glove_pickle_words_2, "rb") as file:
        words_2 = pickle.load(file)
    with open(config.url_glove_pickle_we_3, "rb") as file:
        We_3 = pickle.load(file)
    with open(config.url_glove_pickle_words_3, "rb") as file:
        words_3 = pickle.load(file)
    self.We = []
    self.We.extend(We_1)
    self.We.extend(We_2)
    self.We.extend(We_3)
    self.words = {}
    self.words.update(words_1)
    self.words.update(words_2)
    self.words.update(words_3)
    with open(config.url_snli_pc1, "rb") as file:
        self.snli_pc_1 = pickle.load(file)
    with open(config.url_snli_pc2, "rb") as file:
        self.snli_pc_2 = pickle.load(file)
    print("Successfully got the embeddings from the pickle")
    self.word2weight = data_io.getWordWeight(self.weightfile, self.weightpara)  # word2weight['str'] is the weight for the word 'str'
    self.weight4ind = data_io.getWeight(self.words, self.word2weight)  # weight4ind[i] is the weight for the i-th word
def get_sif(dataset):
    wordfile = '../data/glove.6B.50d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 2.7e-4  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 0  # number of principal components to remove in SIF weighting scheme
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    param = params.params()
    param.rmpc = rmpc
    sentence_embedding_all = get_sentences_embedding(dataset, words, weight4ind, param, We)
    # sentence_embedding_all = turn2std(sentence_embedding_all)  # standardize the embedding matrix
    return sentence_embedding_all
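# The commented-out turn2std call above hints at a column-wise standardization step. The helper
# below is a hypothetical sketch of what it might look like (the original implementation is not
# shown in this file); it z-scores each embedding dimension.
import numpy as np

def turn2std(emb, eps=1e-8):
    """Standardize each embedding dimension to zero mean and unit variance."""
    mean = emb.mean(axis=0, keepdims=True)
    std = emb.std(axis=0, keepdims=True)
    return (emb - mean) / (std + eps)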
def main(word_embeddings_path, word_weight_path, out_dir):
    wordfile = word_embeddings_path
    weightfile = word_weight_path
    weightparas = [1e-2, 1e-3, 1e-4]
    (words, We) = getWordmap(wordfile)
    # pickle requires binary mode
    with open(os.path.join(out_dir, "vectors"), "wb") as vector_file:
        pickle.dump(We, vector_file)
    with open(os.path.join(out_dir, "words"), "wb") as words_file:
        pickle.dump(words, words_file)
    for weightpara in weightparas:
        print("calculating word2weight with a = {}.".format(weightpara))
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        print("calculating weight4ind with a = {}.".format(weightpara))
        weight4ind = data_io.getWeight(words, word2weight)
        weight4ind_path = os.path.join(out_dir, "weight4ind_weightpara_%.E" % Decimal(weightpara))
        with open(weight4ind_path, 'wb') as weight4ind_file:
            pickle.dump(weight4ind, weight4ind_file)
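# Hypothetical usage of the precomputation above (the paths are placeholders and the output
# directory must already exist). The pickles are written in binary mode, so read them back
# with 'rb'.
main('../data/glove.6B.300d.txt', '../auxiliary_data/enwiki_vocab_min200.txt', './sif_cache')
with open('./sif_cache/vectors', 'rb') as f:
    We = pickle.load(f)
with open('./sif_cache/weight4ind_weightpara_1E-03', 'rb') as f:  # suffix produced by '%.E'
    weight4ind = pickle.load(f)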
def sif_embedding(sen):
    import sys
    # sys.path.append("../src")
    # sys.path.append("../data")
    import data_io, params, SIF_embedding
    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # the words and their word vectors
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x, m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:', w)
    # set parameters (use a local name so the imported `params` module is not rebound)
    # param = params.params()
    param = params.params_all()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
def vectorize_sif(filename):
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentiment_file = '../data/sentiment-test'  # sentiment data file
    # cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    # sentiment_file = '../data/clean-5.txt'

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    # x, m, _ = data_io.sentiment2idx(sentiment_file, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()  # instantiate the local params class defined above
    # params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def load_model():
    wordfile = "glove path (glove.840B.300d.txt file)"  # you can download glove from https://www.kaggle.com/takuok/glove840b300dtxt
    weightfile = artifact_path + '/SIF/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    (words, We) = data_io.getWordmap(wordfile)
    # re-key the vocabulary from bytes to str
    a = list(words.keys())
    for i, v in enumerate(a):
        words[v.decode("utf-8")] = words.pop(v)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    return (words, weight4ind, rmpc, We)
def load_embed(wordfile, weightfile, weightpara=1e-3, param=None, rmpc=0):
    '''
    wordfile:   location of embedding data (e.g., GloVe embeddings)
    weightfile: location of term-frequency data for words
    weightpara: the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc:       number of principal components to remove in the SIF weighting scheme
    '''
    # input (these hardcoded paths override whatever was passed in)
    wordfile = '/home/francisco/GitHub/SIF/data/glove.840B.300d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '/home/francisco/GitHub/SIF/auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    # load word vectors
    (words, Weights) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # set parameters (skip when no params object is supplied)
    if param is not None:
        param.rmpc = rmpc
    return Weights, words, word2weight, weight4ind
def get_sent_vec(sentences):
    import params
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # call the core SIF algorithm to compute sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, param)
    sent_vec = {}
    for i in range(len(embedding)):
        sent_vec[sentences[i]] = embedding[i]
    return sent_vec
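# Hypothetical usage -- not part of the original module. get_sent_vec() reads module-level
# settings, so wordfile, weightfile, weightpara and rmpc must be defined before the call;
# the paths below are placeholders.
wordfile = './data/word_vectors.txt'
weightfile = './data/word_frequencies.txt'
weightpara = 1e-3
rmpc = 1
vec_by_sentence = get_sent_vec(['this is an example sentence'])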
wordfiles = [
    # '../data/paragram_sl999_small.txt',  # need to download it from John Wieting's github (https://github.com/jwieting/iclr2016)
    '/Users/sherryruan/data/glove/glove.6B/glove.6B.300d.txt'  # need to download it first
]
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'
weightparas = [-1, 1e-3]  # [-1, 1e-1, 1e-2, 1e-3, 1e-4]
rmpcs = [0, 1]  # [0, 1, 2]

params = params.params()
parr4para = {}
sarr4para = {}

for wordfile in wordfiles:
    (words, We) = data_io.getWordmap(wordfile)
    for weightpara in weightparas:
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        weight4ind = data_io.getWeight(words, word2weight)
        for rmpc in rmpcs:
            print('word vectors loaded from %s' % wordfile)
            print('word weights computed from %s using parameter a=%f' % (weightfile, weightpara))
            params.rmpc = rmpc
            print('remove the first %d principal components' % rmpc)
            ## eval just one example dataset
            parr, sarr = eval.sim_evaluate_one(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
            ## eval all datasets; need to obtain the datasets from John Wieting (https://github.com/jwieting/iclr2016)
            # parr, sarr = eval.sim_evaluate_all(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
            paras = (wordfile, weightfile, weightpara, rmpc)
            parr4para[paras] = parr
            sarr4para[paras] = sarr
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    # personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    # personality = random.choice(personalities)
    # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    wordfile = './data/truncate.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = './auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

    p = 0
    start_time = time.time()
    with open('data_volunteers.json') as json_file:
        json_data = json.load(json_file)
    for i in json_data:
        p += 1
        # if p < 1100:
        #     continue
        history = []
        personality = []
        query_set = []
        json_dialog = i["dialog"]
        json_bot = i["bot_profile"]
        for j in json_bot:
            personality.append(tokenizer.encode(j))
        # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
        persona = tokenizer.decode(chain(*personality))
        row = {"Personality": persona}
        text = []
        for j in json_dialog:
            if j["sender_class"] == "Human":
                json_text = j["text"]
                raw_text = json_text
                check = tokenizer.decode(tokenizer.encode(raw_text), skip_special_tokens=True)
                if check == "":
                    history.append(tokenizer.encode(raw_text))
                    with torch.no_grad():
                        out_ids = normal_sample_sequence(personality, history, tokenizer, model, args)
                    # history.append(out_ids)
                    history = history[-(2 * args.max_history + 1):]
                    out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text,
                        "generated_text": out_text
                    })
                    continue
                history.append(tokenizer.encode(raw_text))
                with torch.no_grad():
                    out_ids = sample_sequence(personality, history, tokenizer, model, args, words, weight4ind, We)
                # history.append(out_ids)
                history = history[-(2 * args.max_history + 1):]
                out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                text.append({
                    "evaluation_score": j["evaluation_score"],
                    "id": j["id"],
                    "sender": j["sender"],
                    "sender_class": j["sender_class"],
                    "text": raw_text,
                    "generated_text": out_text
                })
            else:
                json_text = j["text"]
                raw_text = json_text
                history.append(tokenizer.encode(raw_text))
                text.append({
                    "evaluation_score": j["evaluation_score"],
                    "id": j["id"],
                    "sender": j["sender"],
                    "sender_class": j["sender_class"],
                    "text": raw_text
                })
        row["dialog"] = text
        query_set.append(row)
        # print(query_set)
        with open('./sif_set/sif' + str(p) + '.json', 'w', encoding='utf-8') as make_file:
            json.dump(query_set, make_file)
        if not p % 10:
            print(str(p * 100 / 1111) + '%, ' + str(time.time() - start_time) + 'sec')
def get_embedding(self, sentences, language='Chinese', weightpara=1e-3):
    """
    Return the embeddings for all sentences in the input.

    sentences is a list of sentences that need SIF embeddings.
    """
    if language == 'Chinese':
        # word vector file
        # For model2:
        # wordfile =
        # For model1:
        # wordfile = '../models/wiki_news_word_vector_small2.txt'
        # word frequency file
        # weightfile =
        words = self.words_chi
        word2weight = self.word2weight_chi
        We = self.We_chi
    else:
        # for English use:
        # wordfile =
        # wordfile = '../models/glove.840B.300d.txt'
        # weightfile =
        # weightpara = 1e-5
        # weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
        words = self.words_eng
        word2weight = self.word2weight_eng
        We = self.We_eng
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    weight4ind = data_io.getWeight(words, word2weight)
    print('weight4ind finished')
    # load sentences
    if language == 'Chinese':
        x, m = data_io.sentences2idx_c(sentences, words)
    else:
        x, m = data_io.sentences2idx(sentences, words)
    # print(x.shape)  # (number of sentences, number of words in the longest sentence)
    # print(m.shape)  # (number of sentences, number of words in the longest sentence)
    print('sentences2idx finished')
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    print('seq2weight finished')
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    # Returns the embeddings of all the titles, full texts and sentences whose similarity needs
    # to be computed. The paper uses TruncatedSVD; this project decomposes with PCA instead.
    print('embedding start')
    embedding = SIF_embedding.SIF_embedding(We, x, w, param, method='PCA')  # embedding[i,:] is the embedding for sentence i
    print('embedding finished')
    print(embedding.shape)
    return embedding
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataname', default='t6', help='dataset name', choices=['t6', 't26', '2C'])
    parser.add_argument('-c', '--classifiername', default='RF', help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    # The original SIF paper used glove.840B.300d; here we use vectors trained on Twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimensions if needed
    wordfile_list = ['../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = ["sandy", "queensland", "boston", "west_texas", "oklahoma", "alberta"]
            test_list = ["{}_glove_token.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}_training.csv".format(disaster) for disaster in disasters]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = ["2012_Colorado_wildfires", "2013_Queensland_floods", "2013_Boston_bombings",
                         "2013_West_Texas_explosion", "2013_Alberta_floods", "2013_Colorado_floods",
                         "2013_NY_train_crash"]
            test_list = ["{}-tweets_labeled.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}_training.csv".format(disaster) for disaster in disasters]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = ["Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco", "Boston",
                         "Brisbane", "Dublin", "London", "Sydney"]
            test_list = ["{}2C.csv.token.csv.unique.csv".format(disaster) for disaster in disasters]
            train_list = ["{}2C_training.csv".format(disaster) for disaster in disasters]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train, weight4ind)  # get word weights
            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(We, xtrain_windx, w_train, paramss)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)
            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(We, xtest_windx, w_test, paramsss)  # embedding[i,:] is the embedding for sentence i
            print(test)
            accu, roc, precision, recall, f1 = run_classifier(train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name, clf_name + str(dim), accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim), roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name, clf_name + str(dim), precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim), recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim), f1_list))
        print("{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}".format(
            data_name, clf_name + str(dim),
            np.mean(accu_list), np.std(accu_list),
            np.mean(roc_list), np.std(roc_list),
            np.mean(f1_list), np.std(f1_list),
            np.mean(precision_list), np.std(precision_list),
            np.mean(recall_list), np.std(recall_list)))
def SIF_master(segfile, cleanfile, directory, summ_ind):
    print("segfile: ", segfile)
    print("clean file: ", cleanfile)
    # cleanfile = cleanfile + ".ls"

    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentiment_file = '../data/sentiment-test'  # sentiment data file
    # cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    # sentiment_file = '../data/clean-5.txt'

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    # x, m, _ = data_io.sentiment2idx(sentiment_file, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(cleanfile, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()  # instantiate the local params class defined above
    # params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    # segfile = segfile + ".segs"
    f = open(segfile).readlines()
    indexes = []
    matches = []
    for item in f:
        ind = item.rfind("&")
        indexes.append(item[:ind + 1])
    if len(indexes) == len(embedding):
        for ind in range(0, len(indexes)):
            lines = indexes[ind] + str(list(embedding[ind]))
            matches.append(lines)
    else:
        print("length doesn't match!! Check if there is an empty line!!")
    # fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile) + '.ls'
    # fname = directory + '/' + str(summ_ind) + '/' + segfile + '.ls'
    fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile)
    print(fname)
    with open(fname + ".ls", "w") as outfile:
        for item in matches:
            outfile.write(item + "\n")
    return embedding
# srcsent = ['Pada mulanya, waktu Allah mulai menciptakan alam semesta']
# tgtsent = ['God saw the light, and saw that it was good. God divided the light from the darkness.']
# params = params.params()
weightpara = 1e-3
rmpc = 1

# def srcEmbedding(srcWordFilePath, srcsent):
src_model_300 = gensim.models.KeyedVectors.load_word2vec_format(srcWordFilePath, binary=False)
srcwords = {}
for index, word in enumerate(src_model_300.wv.index2entity):
    srcwords[word] = index
srcWe = src_model_300.wv.vectors
srcword2weight = data_io.getWordWeight(src_model_300.wv.vocab, weightpara)
srcweight4ind = data_io.getWeight(srcwords, srcword2weight)
srcx, srcm = data_io.sentences2idx(srcsent, srcwords)
srcw = data_io.seq2weight(srcx, srcm, srcweight4ind)
srcparams = params.params()
srcparams.rmpc = rmpc
srcEmbedding = SIF_embedding.SIF_embedding(srcWe, srcx, srcw, srcparams)
# return embedding

# def tgtEmbedding(tgtWordFilePath, tgtsent):
tgtmodel_300 = gensim.models.KeyedVectors.load_word2vec_format(tgtWordFilePath, binary=False)
tgtwords = {}
for index, word in enumerate(tgtmodel_300.wv.index2entity):
    tgtwords[word] = index
tgtWe = tgtmodel_300.wv.vectors
tgtword2weight = data_io.getWordWeight(tgtmodel_300.wv.vocab, weightpara)
wordfile = '../models/glove_full_grams_sg_300_wiki.txt'
weightfile = '../AraSIF_word_counts/arwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme

# load word vectors
print("Reading embedding matrix. Hang on! this will take a while ...")
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")
# x is the array of word indices, m is the binary mask indicating whether there is a word in that location
x, m = data_io.sentences2idx(sample_sents, glove_words)
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
# (fragment: the statements below run inside a loop that fills `sentences` from the input file)
line = line.strip()
line = line.split(':')[1]
sentences.append(line)

glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'  # word vector file, can be downloaded from GloVe website
word_frequency_path = './douban_cropus_vocab.txt'  # each line is a word and its frequency
weightpara = 1e-3
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)
# load word weights
word2weight = data_io.getWordWeight(word_frequency_path, weightpara)  # word2weight['str'] is the weight for the word 'str'
Index2Weight = data_io.getWeight(Word2Indx, word2weight)  # weight4ind[i] is the weight for the i-th word

word_idx_seq_of_sentence, mask = data_io.sentences2idx(sentences, Word2Indx)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask, Index2Weight)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc

embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, params)
np.save("douban_sentence2vector.npy", embedding)
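# Hypothetical follow-up, not part of the original script: reload the saved matrix and compare
# two sentence vectors with cosine similarity.
emb = np.load("douban_sentence2vector.npy")
cos = emb[0] @ emb[1] / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]) + 1e-12)
print(cos)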