def main(sentences,
         wordfile: str,
         weightfile: str,
         weightpara: float = 1e-3,
         rmpc: int = 1):
    """Compute SIF sentence embeddings for ``sentences``.

    Args:
        sentences: Sentences to embed; passed unchanged to
            ``data_io.sentences2idx``.
        wordfile: Path of the word-vector file read by
            ``data_io.getWordmap``.
        weightfile: Path of the word-frequency file read by
            ``data_io.getWordWeight``.
        weightpara: Parameter of the SIF weighting scheme.
        rmpc: Number of principal components to remove.

    Returns:
        The value returned by ``SIF_embedding.SIF_embedding``;
        ``embedding[i, :]`` is the embedding for sentence ``i``.
    """
    # load word vectors
    words, We = data_io.getWordmap(wordfile)

    # load word weights
    # word2weight['str'] is the weight for the word 'str'
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    # weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(words, word2weight)

    # load sentences: x is the array of word indices, m is the binary mask
    # indicating whether there is a word in that location
    x, m, _ = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters
    # The settings object needs its own name: assigning to ``params`` inside
    # this function would make ``params`` local and the module lookup on the
    # right-hand side would raise UnboundLocalError.
    sif_params = params.params()
    sif_params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, sif_params)
    return embedding
def load_model(self):
    """Load the Chinese and English word vectors and SIF word weights.

    Stores the results on the instance as ``words_chi`` / ``We_chi`` /
    ``word2weight_chi`` and ``words_eng`` / ``We_eng`` / ``word2weight_eng``.
    """
    sys.path.append('../src')

    # Chinese model. 1e-5 is the parameter in the SIF weighting scheme
    # (usually in the range [3e-5, 3e-3]).
    print('读取中文模型')
    chi_wordmap = data_io.getWordmap('../models/wiki_news_model2_vector.txt')
    self.words_chi, self.We_chi = chi_wordmap
    # Each line of the count file is a word and its frequency;
    # word2weight['str'] is the weight for the word 'str'.
    self.word2weight_chi = data_io.getWordWeight(
        '../models/word_count.txt', 1e-5)
    print('中文模型读取完毕')

    # English model, using weighting parameter 1e-3.
    print('读取英文模型')
    eng_wordmap = data_io.getWordmap('../models/glove_large.txt')
    self.words_eng, self.We_eng = eng_wordmap
    self.word2weight_eng = data_io.getWordWeight(
        '../models/enwiki_vocab_min200.txt', 1e-3)
    print('英文模型读取完毕')
def load_embeddings(wordfile, weightfile, weightpara=5e-4, word2vec=False):
    """Load word vectors and the per-index SIF weights.

    Args:
        wordfile: Path of the word-vector file.
        weightfile: Path of the word-frequency file.
        weightpara: Parameter of the SIF weighting scheme.
        word2vec: When true, read ``wordfile`` with ``getWordmapWord2Vec``
            instead of ``data_io.getWordmap``.

    Returns:
        ``(words, We, weight4ind)`` where ``weight4ind[i]`` is the weight for
        the i-th word.
    """
    # Pick the reader first, then load the vocabulary and vectors with it.
    read_wordmap = getWordmapWord2Vec if word2vec else data_io.getWordmap
    words, We = read_wordmap(wordfile)

    # word -> weight lookup, re-indexed by vocabulary position
    weight4ind = data_io.getWeight(
        words, data_io.getWordWeight(weightfile, weightpara))
    return words, We, weight4ind
def get_embs(sentences, params):
    """Return the SIF embedding matrix for ``sentences``.

    Reads the module-level ``wordfile``, ``weightfile`` and ``weightpara``.

    Args:
        sentences: Sentences to embed; handed to ``data_io.sentences2idx``.
        params: Settings object forwarded to ``SIF_embedding.SIF_embedding``.

    Returns:
        The embedding; row ``i`` is the embedding for sentence ``i``.
    """
    # vocabulary and word vectors
    vocab, vectors = data_io.getWordmap(wordfile)

    # per-word weights, re-indexed so entry i is the weight of the i-th word
    index_weights = data_io.getWeight(
        vocab, data_io.getWordWeight(weightfile, weightpara))

    # word-index array plus the binary mask marking real word positions
    indices, mask = data_io.sentences2idx(sentences, vocab)
    token_weights = data_io.seq2weight(indices, mask, index_weights)

    return SIF_embedding.SIF_embedding(vectors, indices, token_weights, params)
def get_sif(dataset):
    """Compute SIF sentence embeddings for ``dataset``.

    Uses fixed GloVe and word-frequency files, then delegates to
    ``get_sentences_embedding``.

    Args:
        dataset: Passed unchanged to ``get_sentences_embedding``.

    Returns:
        Whatever ``get_sentences_embedding`` returns for the dataset.
    """
    # word vector file, can be downloaded from GloVe website
    glove_path = '../data/glove.6B.50d.txt'
    # each line is a word and its frequency
    frequency_path = '../auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in [3e-5, 3e-3]
    sif_a = 2.7e-4

    words, We = data_io.getWordmap(glove_path)
    # weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(
        words, data_io.getWordWeight(frequency_path, sif_a))

    param = params.params()
    param.rmpc = 0  # number of principal components to remove

    # The result is returned as-is (no conversion to a standard matrix).
    return get_sentences_embedding(dataset, words, weight4ind, param, We)
def vectorize_sif(filename):
    """Return SIF embeddings for the sentences stored in ``filename``.

    Args:
        filename: Data file handed to ``data_io.sentiment2idx``.

    Returns:
        The embedding from ``SIF_embedding_lib.SIF_embedding``; row ``i`` is
        the embedding for sentence ``i``.
    """

    class params(object):
        """Local settings holder passed to the SIF routine."""

        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            fields = ("LW", self.LW, ", LC", self.LC, ", eta", self.eta)
            return ' '.join(str(field) for field in fields)

    # word vector file, can be downloaded from GloVe website
    glove_path = 'glove.6B.100d.txt'
    # each line is a word and its frequency
    frequency_path = 'enwiki_vocab_min200.txt'

    # load word vectors
    words, We = data_io.getWordmap(glove_path)

    # load word weights (SIF weighting parameter 1e-3, usually in the range
    # [3e-5, 3e-3]); weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(
        words, data_io.getWordWeight(frequency_path, 1e-3))

    # x is the array of word indices, m is the binary mask indicating whether
    # there is a word in that location
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)

    settings = params()
    settings.rmpc = 1  # number of principal components to remove

    return SIF_embedding_lib.SIF_embedding(We, x, w, settings)
def sif_embedding(sen):
    """Return SIF embeddings for the sentences in ``sen``.

    Args:
        sen: Sentences to embed; handed to ``data_io.sentences2idx``.

    Returns:
        The embedding from ``SIF_embedding.SIF_embedding``; row ``i`` is the
        embedding for sentence ``i``.
    """
    import data_io
    import params
    import SIF_embedding

    # word vector file
    vector_path = 'data/dic_files.txt'
    # each line is a word and its frequency
    frequency_path = 'data/dic_freq.txt'

    # words and their vectors
    words, We = data_io.getWordmap(vector_path)

    # SIF weighting parameter 1e-3 (usually in the range [3e-5, 3e-3]);
    # weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(
        words, data_io.getWordWeight(frequency_path, 1e-3))

    # x is the array of word indices, m is the binary mask indicating whether
    # there is a word in that location
    x, m = data_io.sentences2idx(sen, words)
    w = data_io.seq2weight(x, m, weight4ind)

    sif_params = params.params_all()
    sif_params.rmpc = 1  # number of principal components to remove

    return SIF_embedding.SIF_embedding(We, x, w, sif_params)
def load_model():
    """Load GloVe vectors and SIF word weights.

    Reads the module-level ``artifact_path`` to locate the frequency file.

    Returns:
        ``(words, weight4ind, rmpc, We)``; ``weight4ind[i]`` is the weight for
        the i-th word and ``rmpc`` is the number of principal components to
        remove.
    """
    # glove can be downloaded from https://www.kaggle.com/takuok/glove840b300dtxt
    wordfile = "glove path (glove.840B.300d.txt file)"
    # each line is a word and its frequency
    weightfile = artifact_path + '/SIF/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in [3e-5, 3e-3]
    weightpara = 1e-3
    rmpc = 1

    words, We = data_io.getWordmap(wordfile)

    # Re-key the vocabulary in place, decoding each key as UTF-8. The key
    # snapshot is taken first so the mapping can be mutated while looping.
    for raw_key in tuple(words.keys()):
        words[raw_key.decode("utf-8")] = words.pop(raw_key)

    # word2weight['str'] is the weight for the word 'str'
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    return (words, weight4ind, rmpc, We)
def load_embed(wordfile, weightfile, weightpara=1e-3, param=None, rmpc=0):
    '''
    Load word vectors and SIF word weights.

    wordfile:   location of embedding data (e.g., glove embeddings); when
                empty/None the historical hard-coded GloVe path is used
    weightfile: location of TF data for words; when empty/None the
                historical hard-coded frequency file is used
    weightpara: the parameter in the SIF weighting scheme, usually in range
                [3e-5, 3e-3]
    param:      optional settings object; when given, its ``rmpc`` attribute
                is set to ``rmpc``
    rmpc:       number of principal components to remove in SIF weighting
                scheme

    Returns ``(Weights, words, word2weight, weight4ind)``.
    '''
    # Honour the caller's paths. Previously both arguments were
    # unconditionally overwritten, so they were silently ignored; the old
    # values remain only as fallbacks.
    wordfile = wordfile or '/home/francisco/GitHub/SIF/data/glove.840B.300d.txt'
    weightfile = weightfile or (
        '/home/francisco/GitHub/SIF/auxiliary_data/enwiki_vocab_min200.txt')

    # load word vectors
    (words, Weights) = data_io.getWordmap(wordfile)

    # load word weights
    # word2weight['str'] is the weight for the word 'str'
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    # weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(words, word2weight)

    # set parameters; with the default ``param=None`` there is nothing to
    # update (the unguarded assignment used to raise AttributeError)
    if param is not None:
        param.rmpc = rmpc

    return Weights, words, word2weight, weight4ind
def get_sent_vec(sentences):
    """Map every sentence to its SIF embedding vector.

    Reads the module-level ``wordfile``, ``weightfile``, ``weightpara`` and
    ``rmpc``.

    Args:
        sentences: Indexable collection of sentences; each sentence is used
            as a dictionary key, so it must be hashable.

    Returns:
        dict mapping ``sentences[i]`` to ``embedding[i]``.
    """
    import params as params_module

    # see data_io.py for details
    words, We = data_io.getWordmap(wordfile)
    weight4ind = data_io.getWeight(
        words, data_io.getWordWeight(weightfile, weightpara))
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # parameter settings
    sif_params = params_module.params()
    sif_params.rmpc = rmpc

    # call the SIF core algorithm to compute sentence vectors; see SIF_core
    embedding = SIF_core.SIF_embedding(We, x, w, sif_params)

    sent_vecs = {}
    for position, vector in enumerate(embedding):
        sent_vecs[sentences[position]] = vector
    return sent_vecs
import os
import sys

import gensim
import numpy
import pandas as pd
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from scipy.stats import spearmanr

import data_io

# Usage: <script> <word-vector file> <word-similarity dataset>
filename = sys.argv[1]
dataset = sys.argv[2]
f_base = os.path.splitext(os.path.basename(filename))[0]
d_base = os.path.splitext(os.path.basename(dataset))[0]

(words, We) = data_io.getWordmap(filename)

# The dataset is a ';'-separated file without header: word1;word2;sim
wordsim = pd.read_csv(dataset,
                      delimiter=';',
                      names=['word1', 'word2', 'sim'],
                      index_col=None)

similarities = []
i = 0    # pairs with at least one word missing from the vocabulary
tot = 0  # pairs for which a similarity was computed
for index, row in wordsim.iterrows():
    try:
        # Look each vector up once; a missing word raises KeyError.
        vec1 = We[words[row['word1']]]
        vec2 = We[words[row['word2']]]
        # cosine similarity of the two word vectors
        similarity = numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) *
                                              numpy.linalg.norm(vec2))
        tot += 1
    except KeyError:
        similarity = numpy.nan
        i += 1
    similarities.append(similarity)
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="gpt", help="Model type (gpt or gpt2)") parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": args.model_checkpoint = download_pretrained_model() random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2LMHeadModel if "gpt2" == 
args.model else OpenAIGPTLMHeadModel model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") #personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache) #personality = random.choice(personalities) #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) wordfile = './data/truncate.txt' # word vector file, can be downloaded from GloVe website weightfile = './auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3] # load word vectors (words, We) = data_io.getWordmap(wordfile) # load word weights word2weight = data_io.getWordWeight( weightfile, weightpara) # word2weight['str'] is the weight for the word 'str' weight4ind = data_io.getWeight( words, word2weight) # weight4ind[i] is the weight for the i-th word p = 0 start_time = time.time() with open('data_volunteers.json') as json_file: json_data = json.load(json_file) for i in json_data: p += 1 #if p <1100: # continue history = [] personality = [] query_set = [] json_dialog = i["dialog"] json_bot = i["bot_profile"] for j in json_bot: personality.append(tokenizer.encode(j)) #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) persona = tokenizer.decode(chain(*personality)) row = {"Personality": persona} text = [] for j in json_dialog: if j["sender_class"] == "Human": json_text = j["text"] raw_text = json_text check = tokenizer.decode(tokenizer.encode(raw_text), skip_special_tokens=True) if check == "": history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = normal_sample_sequence( personality, history, tokenizer, model, args) # history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) text.append({ "evaluation_score": 
j["evaluation_score"], "id": j["id"], "sender": j["sender"], "sender_class": j["sender_class"], "text": raw_text, "generated_text": out_text }) continue history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args, words, weight4ind, We) # history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) text.append({ "evaluation_score": j["evaluation_score"], "id": j["id"], "sender": j["sender"], "sender_class": j["sender_class"], "text": raw_text, "generated_text": out_text }) else: json_text = j["text"] raw_text = json_text history.append(tokenizer.encode(raw_text)) text.append({ "evaluation_score": j["evaluation_score"], "id": j["id"], "sender": j["sender"], "sender_class": j["sender_class"], "text": raw_text }) row["dialog"] = text query_set.append(row) #print(query_set) with open('./sif_set/sif' + str(p) + '.json', 'w', encoding='utf-8') as make_file: json.dump(query_set, make_file) if not p % 10: print( str(p * 100 / 1111) + '%, ' + str(time.time() - start_time) + 'sec') '''
# NOTE(review): the statement preceding this fragment is not visible here;
# confirm whether this check is nested under an enclosing condition.
if params.clip == 0:
    params.clip = None

params.minval = args.minval
params.maxval = args.maxval

# Translate the numeric --nonlinearity code into the lasagne function.
if args.nonlinearity:
    for code, nonlinearity in (
        (1, lasagne.nonlinearities.linear),
        (2, lasagne.nonlinearities.tanh),
        (3, lasagne.nonlinearities.rectify),
        (4, lasagne.nonlinearities.sigmoid),
    ):
        if args.nonlinearity == code:
            params.nonlinearity = nonlinearity

# load data
(words, We) = data_io.getWordmap(params.wordfile)
if args.task in ("sim", "ent"):
    train_data = data_io.getSimEntDataset(params.traindata, words,
                                          params.task)
elif args.task == "sentiment":
    train_data = data_io.getSentimentDataset(params.traindata, words)
else:
    raise ValueError('Task should be ent, sim, or sentiment.')

# load weight: without a weight file the per-index weights stay empty
if not params.weightfile:
    params.weight4ind = []
else:
    word2weight = data_io.getWordWeight(params.weightfile,
                                        params.weightpara)
    params.weight4ind = data_io.getWeight(words, word2weight)
    print('word weights computed using parameter a=' +
          str(params.weightpara))
""" import datapre import data_io,params import Embedding Wordweight_file='weight/word_weight_3a.txt' Clauseweight_file='weight/Clause_weight.txt' Phraseweight_file='weight/Phrase_weight.txt' word_weight=datapre.TreeNode_Weight(Wordweight_file) clause_weight=datapre.TreeNode_Weight(Clauseweight_file) phrase_weight=datapre.TreeNode_Weight(Phraseweight_file) wordfile = 'wordvector/glove.6B.50d.txt' # word vector file, can be downloaded from GloVe website (words, word_emb) = data_io.getWordmap(wordfile) ########################################### prefix = "datapre/" farr1 = [ "MSRpar2012-1.txt", #"MSRpar2012-2.txt", "MSRvid2012-1.txt", #"MSRvid2012-2.txt", "OnWN2012-1.txt", #"OnWN2012-2.txt", "OnWN2013-1.txt", #"OnWN2013-2.txt", "OnWN2014-1.txt", #"OnWN2014-2.txt",
def main():
    """Evaluate SIF sentence embeddings with a classifier on tweet datasets.

    For the dataset chosen with ``-d`` and the classifier chosen with ``-c``,
    every (training file, test file) pair is embedded with SIF, handed to
    ``run_classifier``, and the per-pair scores plus their mean and standard
    deviation are printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-d',
                     '--dataname',
                     default='t6',
                     help='dataset name',
                     choices=['t6', 't26', '2C'])
    cli.add_argument('-c',
                     '--classifiername',
                     default='RF',
                     help='which classifier to use',
                     choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = cli.parse_args()
    data_name = args.dataname
    clf_name = args.classifiername

    # Original SIF paper used glove.840B.300d; here the vectors trained on
    # twitter are used. 25, 50, 200 dimensions can be added if needed.
    embed_dims = [100]
    wordfile_list = [
        '../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims
    ]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    # dataset name -> (folder, disasters, test-file pattern, train-file pattern)
    layouts = {
        "t6": ("CrisisLexT6_cleaned/", [
            "sandy", "queensland", "boston", "west_texas", "oklahoma",
            "alberta"
        ], "{}_glove_token.csv.unique.csv", "{}_training.csv"),
        "t26": ("CrisisLexT26_cleaned/", [
            "2012_Colorado_wildfires", "2013_Queensland_floods",
            "2013_Boston_bombings", "2013_West_Texas_explosion",
            "2013_Alberta_floods", "2013_Colorado_floods",
            "2013_NY_train_crash"
        ], "{}-tweets_labeled.csv.unique.csv", "{}_training.csv"),
        "2C": ("2CTweets_cleaned/", [
            "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco", "Boston",
            "Brisbane", "Dublin", "London", "Sydney"
        ], "{}2C.csv.token.csv.unique.csv", "{}2C_training.csv"),
    }

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        words, We = data_io.getWordmap(wordfile)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(
            words, data_io.getWordWeight(weightfile, weightpara))

        def sif_embed(texts):
            """Return the SIF embedding of ``texts`` (row i = sentence i)."""
            # word indices plus the binary mask marking real word positions
            word_idx, mask = data_io.sentences2idx(texts, words)
            token_weights = data_io.seq2weight(word_idx, mask, weight4ind)
            settings = params.params()
            settings.rmpc = rmpc
            return SIF_embedding.SIF_embedding(We, word_idx, token_weights,
                                               settings)

        folder, disasters, test_pattern, train_pattern = layouts[data_name]
        file_path = "../data/" + folder
        test_list = [test_pattern.format(disaster) for disaster in disasters]
        train_list = [
            train_pattern.format(disaster) for disaster in disasters
        ]

        accu_list, roc_list = [], []
        precision_list, recall_list, f1_list = [], [], []

        for train, test in zip(train_list, test_list):
            xtrain, ytrain = load_data(data_name,
                                       os.path.join(file_path, train))
            xtest, ytest = load_data(data_name, os.path.join(file_path, test))

            train_embed = sif_embed(xtrain)
            test_embed = sif_embed(xtest)
            print(test)

            accu, roc, precision, recall, f1 = run_classifier(
                train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        label = clf_name + str(dim)

        # per-pair scores, one line per metric
        for template, scores in (
            ("{}_SIF_{}_LOO_accuracy {}", accu_list),
            ("{}_SIF_{}_LOO_roc {}", roc_list),
            ("{}_SIF_{}_LOO_precision {}", precision_list),
            ("{}_SIF_{}_LOO_recall {}", recall_list),
            ("{}_SIF_{}_LOO_f1 {}", f1_list),
        ):
            print(template.format(data_name, label, scores))

        # summary line: mean + std for accuracy, roc, f1, precision, recall
        summary = []
        for scores in (accu_list, roc_list, f1_list, precision_list,
                       recall_list):
            summary.append(np.mean(scores))
            summary.append(np.std(scores))
        print(
            "{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
            .format(data_name, label, *summary))
'-lr', action='store', default='0.01', help='Learning rate.') parser.add_argument('--iterations', '-i', action='store', default=250, help=('Number of iterations.')) args = parser.parse_args() pretrained_vectors = args.pretrained_vectors (words, weights) = data_io.getWordmap(pretrained_vectors) print weights.shape initial_embeddings = {v: weights[words[v]] for v in words} my_vocabulary = open(args.my_vocabulary, 'r') vocab = my_vocabulary.read().split('\n') vocab_len = len(vocab) - 1 print "Reading co-occurrence matrix..." data = np.genfromtxt(args.my_coo_matrix, names=True, dtype=None, delimiter=',') my_coo_matrix = sparse.coo_matrix( (data['cooccurrence'], (data['word_a'], data['word_b'])), shape=(vocab_len, vocab_len)) print "Converting co-occurence matrix to csr format..."
def SIF_master(segfile, cleanfile, directory, summ_ind):
    """Embed the sentences of ``cleanfile`` with SIF and save them per segment.

    Each line of ``segfile`` is cut after its last ``&``; that prefix is
    joined with the embedding of the sentence at the same position and
    written to ``<directory>/<summ_ind>/<getRealName(segfile)>.ls``. When the
    number of segment lines differs from the number of embeddings, a warning
    is printed and an empty file is written.

    Args:
        segfile: Path of the segment file (one segment per line).
        cleanfile: Path of the cleaned-sentence file handed to
            ``data_io.sentiment2idx``.
        directory: Output root directory.
        summ_ind: Output sub-directory name (converted with ``str``).

    Returns:
        The embedding from ``SIF_embedding_lib.SIF_embedding``; row ``i`` is
        the embedding for sentence ``i``.
    """
    # Single-argument print calls parse on both Python 2 and Python 3; the
    # double space matches what ``print "segfile: ", segfile`` emitted.
    print("segfile:  %s" % (segfile,))
    print("clean file:  %s" % (cleanfile,))

    class params(object):
        """Local settings holder passed to the SIF routine."""

        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            return ' '.join(map(str, t))

    # input
    # word vector file, can be downloaded from GloVe website
    wordfile = 'glove.6B.100d.txt'
    # each line is a word and its frequency
    weightfile = 'enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights; word2weight['str'] is the weight for the word 'str'
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    # weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(words, word2weight)

    # x is the array of word indices, m is the binary mask indicating whether
    # there is a word in that location
    x, m = data_io.sentiment2idx(cleanfile, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # parameters
    sif_params = params()
    sif_params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(We, x, w, sif_params)

    # Read the segment file through a context manager so the handle is
    # closed (it used to be left open).
    with open(segfile) as seg_handle:
        seg_lines = seg_handle.readlines()

    # keep everything up to and including the last '&' of each line
    indexes = [item[:item.rfind("&") + 1] for item in seg_lines]

    matches = []
    if len(indexes) == len(embedding):
        matches = [
            prefix + str(list(vector))
            for prefix, vector in zip(indexes, embedding)
        ]
    else:
        print("length doesn't match!! Check if there is empty line!!")

    fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile)
    print(fname)
    with open(fname + ".ls", "w") as out:
        for item in matches:
            out.write(item + "\n")
    return embedding
header=0, delimiter="\t", quoting=3) test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'testData.tsv'), header=0, delimiter="\t", quoting=3) #Data Leak test["sentiment"] = test["id"].map( lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0) y_test = test["sentiment"] vectors = sys.argv[1] (words, We) = data_io.getWordmap(vectors) num_features = We.shape[1] p, trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), We, words, num_features) print('Train: {0} '.format(p)) p, testDataVecs = getAvgFeatureVecs(getCleanReviews(test), We, words, num_features) print('Test: {0} '.format(p)) log_reg = LogisticRegression() print "Fitting a logistic regression model to labeled training data..." log_reg = log_reg.fit(trainDataVecs, train["sentiment"])
import SIF_embedding
import read_NMT_data

# Script section: load the Arabic GloVe vectors and the SIF word weights,
# then prepare the SIF settings before the input sentences are read.

# input Arabic file; SIF embeddings are computed for all sentences in it
sample_ara = '../NMT_data/sample.ara'

# Arabic GloVe embedding pre-trained model
wordfile = '../models/glove_full_grams_sg_300_wiki.txt'
# each line is a word and its frequency
weightfile = '../AraSIF_word_counts/arwiki_vocab_min200.txt'
# the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
weightpara = 1e-3
# number of principal components to remove in SIF weighting scheme
rmpc = 1

# load word vectors
print("Reading embedding matrix. Hang on! this will take a while ...")
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
# word2weight['str'] is the weight for the word 'str'
word2weight = data_io.getWordWeight(weightfile, weightpara)
# weight4ind[i] is the weight for the i-th word
weight4ind = data_io.getWeight(glove_words, word2weight)

# set parameters
# NOTE: this rebinds the name ``params`` from the module to the settings
# instance, so ``params.params()`` cannot be called again after this line.
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
# Script section: read the Douban corpus, then build the word indices and
# word weights that the SIF embedding step needs.
douban_cropus_path = '/bigdata/xiaoma/Assi12/douban.txt'
sentences = []
with open(douban_cropus_path) as f:
    for line in f:
        line = line.strip()
        # Keep only the field after the first ':' (and before any second
        # one); a line without ':' raises IndexError here.
        line = line.split(':')[1]
        sentences.append(line)

# word vector file, can be downloaded from GloVe website
glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'
# each line is a word and its frequency
word_freauency_path = './douban_cropus_vocab.txt'
# parameter of the SIF weighting scheme
weightpara = 1e-3
# presumably the number of principal components to remove; it is not used
# in the visible part of the script — confirm against the code that follows
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)

# load word weights
# word2weight['str'] is the weight for the word 'str'
word2weight = data_io.getWordWeight(word_freauency_path, weightpara)
# Index2Weight[i] is the weight for the i-th word
Index2Weight = data_io.getWeight(Word2Indx, word2weight)

# word_idx_seq_of_sentence is the array of word indices, mask is the binary
# mask indicating whether there is a word in that location
word_idx_seq_of_sentence, mask = data_io.sentences2idx(sentences, Word2Indx)
# get word weights
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask,
                                             Index2Weight)

# set parameters
# NOTE: this rebinds the name ``params`` from the module to the settings
# instance.
params = params.params()