def translate(original_w2v='original_w2v', new_w2v='new_w2v', from_docfreqs='from.npy', to_docfreqs='to.npy'):
    # Load the new model whose vocabulary we translate into
    wn = w2v()
    wn.load_minimal(new_w2v)
    n_words = len(wn.model.syn0)
    to_d = np.zeros(n_words)

    # Load the original model and its document frequencies
    wo = w2v()
    wo.load_minimal(original_w2v)
    f = open(from_docfreqs, 'rb')
    from_d = np.load(f)
    f.close()

    # Translate: copy each word's document frequency from its old index to its new one
    print 'Begin translation...'
    success = 0
    for word in wn.model.vocab:
        i = wn.model.vocab[word].index
        if wo.exists_word(word):
            j = wo.model.vocab[word].index
            to_d[i] = from_d[j]
            success += 1
        else:
            to_d[i] = 1.0  # default frequency for words missing from the original vocabulary
    print 'Success rate: ' + str(float(success) / len(wn.model.vocab))

    print 'Saving...'
    f = open(to_docfreqs, 'wb')
    np.save(f, to_d)
    f.close()
    print 'Done.'
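# Example usage (a sketch; the paths below are hypothetical and assume both
# models were saved in the project's 'minimal' format, with document
# frequencies already computed for the original vocabulary):
#
#   translate(original_w2v='../data/model/minimal_old',
#             new_w2v='../data/model/minimal_new',
#             from_docfreqs='../data/model/docfreq_old.npy',
#             to_docfreqs='../data/model/docfreq_new.npy')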
def __init__(self, pairs_filename='pairs.txt', no_pairs_filename='no_pairs.txt',
             docfreq_filename='docfreqs.npy', w2v_filename='minimal',
             no_words=20, embedding_dim=400, batch_size=100):
    super(PairProcessor, self).__init__()
    self.pairs_filename = pairs_filename
    self.no_pairs_filename = no_pairs_filename
    self.batch_size = batch_size
    self.no_words = no_words
    self.embedding_dim = embedding_dim
    self.docfreq_filename = docfreq_filename
    self.w2v_filename = w2v_filename

    # Pre-allocate batch buffers: two text representations plus two per-pair targets
    self.x1 = np.zeros((batch_size, embedding_dim, no_words), dtype=theano.config.floatX)
    self.x2 = np.zeros((batch_size, embedding_dim, no_words), dtype=theano.config.floatX)
    self.y = np.zeros((batch_size,), dtype=theano.config.floatX)
    self.z = np.zeros((batch_size,), dtype=theano.config.floatX)

    # Load precomputed document frequencies
    f = open(self.docfreq_filename)
    self.docfreqs = np.load(f)
    f.close()

    # Accept either a path to a minimal w2v model or an already-loaded instance
    if isinstance(w2v_filename, basestring):
        self.w = w2v()
        self.w.load_minimal(self.w2v_filename)
    else:
        self.w = w2v_filename
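# Example usage (a sketch; the paths are hypothetical, and passing an already
# loaded w2v instance instead of a filename avoids reloading the model):
#
#   shared = w2v()
#   shared.load_minimal('../data/model/minimal')
#   processor = PairProcessor(pairs_filename='../data/pairs/enwiki_pairs_r.txt',
#                             no_pairs_filename='../data/pairs/enwiki_no_pairs_r.txt',
#                             docfreq_filename='../data/model/docfreq.npy',
#                             w2v_filename=shared)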
def Create_Vectorizer(self, name, k, cat):
    # Factory: map a vectorizer name to a configured instance
    if name == 'CountVec':
        return CountVectorizer(analyzer="word",
                               stop_words=nltk.corpus.stopwords.words('portuguese'),
                               max_features=5000)
    elif name == 'NGram':
        return CountVectorizer(analyzer="char",
                               ngram_range=(3, 16),
                               tokenizer=None,
                               preprocessor=None,
                               max_features=3000)
    elif name == 'TFidf':
        return TfidfVectorizer(min_df=2,
                               stop_words=nltk.corpus.stopwords.words('portuguese'))
    elif name == 'selecao':
        return Selecao(k, cat)
    elif name == 'w2v':
        return w2v()
    elif name == 'w2v_mean':
        return w2v_mean()
    else:
        raise NameError('Vectorizer not found')
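# Example usage (a sketch; `classifier` stands in for an instance of the
# enclosing class and `docs` for a list of Portuguese documents; `k` and `cat`
# are only consulted by the 'selecao' vectorizer):
#
#   vectorizer = classifier.Create_Vectorizer('TFidf', k=None, cat=None)
#   features = vectorizer.fit_transform(docs)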
def calculate(corpus='enwiki.txt', w2v_minimal_model='minimal', output_file='docfreq.npy'):
    w = w2v()
    w.load_minimal(w2v_minimal_model)
    n_words = len(w.model.vocab)
    freqs = np.zeros(n_words)

    # Count, for each vocabulary word, the number of lines (documents) it appears in
    i = 0
    with open(corpus) as f:
        for line in f:
            if i % 10000 == 0:
                print "Processing line " + str(i) + "..."
            i += 1
            words = set(line.split())  # count each word at most once per line
            for word in words:
                if w.exists_word(word):
                    freqs[w.model.vocab[word].index] += 1.0

    print 'Saving...'
    f = open(output_file, 'wb')
    np.save(f, freqs)
    f.close()
    print 'Done.'
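# Example usage (a sketch; paths are hypothetical, and the resulting file can
# later be remapped to another vocabulary with translate() above):
#
#   calculate(corpus='../data/enwiki.txt',
#             w2v_minimal_model='../data/model/minimal',
#             output_file='../data/model/docfreq.npy')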
def extract_random(max_words_per_text=30, skip=2, n_pairs=5000000,
                   corpus='../data/enwiki.txt', w2v_model='../data/model/minimal',
                   pairs_file='../data/pairs/enwiki_pairs_r.txt',
                   no_pairs_file='../data/pairs/enwiki_no_pairs_r.txt'):
    w = w2v()
    print 'Loading model...'
    w.load_minimal(w2v_model)
    print 'Done.'

    o = open(pairs_file, 'w')
    n = open(no_pairs_file, 'w')
    pool = []
    add1 = 0
    add2 = 10
    current_pair = 0.0
    f = open(corpus, 'r')
    for line in f:
        words = line.split()  # get words of current paragraph
        words_per_text_1 = randint(10, max_words_per_text)
        words_per_text_2 = randint(10, max_words_per_text)
        # number of words needs to be sufficiently high
        if len(words) >= words_per_text_1 + words_per_text_2 + (words_per_text_1 + words_per_text_2) / 4:
            pair1 = []
            pair2 = []
            # add words to first part of pair
            while len(pair1) < words_per_text_1 and len(words) > 0:
                current = words.pop(0)
                if w.exists_word(current):
                    pair1.append(current)
            # skip a few words between the two parts
            for s in xrange(skip):
                if len(words) > 0:
                    words.pop(0)
            # add words to second part of pair
            while len(pair2) < words_per_text_2 and len(words) > 0:
                current = words.pop(0)
                if w.exists_word(current):
                    pair2.append(current)
            if len(pair1) == words_per_text_1 and len(pair2) == words_per_text_2:
                if add1 == 0:  # add pair1 to the pool
                    pool.append(pair1)
                if add2 == 0:  # add pair2 to the pool
                    pool.append(pair2)
                add1 = (add1 + 1) % 20
                add2 = (add2 + 1) % 20
                o.write(' '.join(pair1) + ';' + ' '.join(pair2) + '\n')  # write positive pair to output file
                current_pair += 1.0
                if len(pool) >= 100:  # process 'no pairs'
                    print 'Progress %.3f%%' % (100.0 * current_pair / n_pairs)
                    for i in xrange(10):
                        shuffle(pool)
                    for p in xrange(len(pool) - 1):
                        n.write(' '.join(pool[p]) + ';' + ' '.join(pool[p + 1]) + '\n')  # write mismatched pairs to output file
                    pool = []
                    o.flush()
                    n.flush()
                if current_pair >= n_pairs:
                    break
    f.close()
    o.close()
    n.close()
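# Example usage (a sketch with the default arguments; each qualifying
# paragraph yields one positive pair, while the periodically shuffled pool
# turns pooled fragments into mismatched 'no pairs'):
#
#   extract_random(max_words_per_text=30, skip=2, n_pairs=5000000)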
        max = np.max(d)
        if np.min(d) < min:
            min = np.min(d)
    return min, max


if __name__ == '__main__':
    f = open('../data/wiki/model/docfreq.npy')
    docfreqs = np.load(f)
    f.close()

    f = open('../data/tweets/model/idf_weights.npy')
    idf_weights = np.load(f)
    f.close()

    w = w2v()
    w.load_minimal('../data/wiki/model/minimal')

    texts = ['../data/wiki/pairs/enwiki_no_pairs_10.txt',
             '../data/wiki/pairs/enwiki_pairs_10.txt']
    labels = ['Pairs', 'No pairs']
    colors = ['0.75', '0.45']

    # s = 'anarchism is a political philosophy that advocated stateless societies often self governed voluntary institutions but that several authors have defined'.split()
    # sv = []
    # for word in s:
    #     sv.append(w.get_vector(word))
    # for k in xrange(len(sv)):
    #     make_plot_from_vector(sv[k], s[k], 'vector_'+str(k)+'.png')
import gensim
import matplotlib
import numpy as np  # needed for np.load below

from w2v import w2v
import metrics
import similarity_plots as sp

f = open('../data/google/model/docfreq.npy')
docfreqs = np.load(f)
f.close()

# f = open('../data/tweets/model/idf_weights.npy')
# idf_weights = np.load(f)
# f.close()

w = w2v()
#w.load_minimal('../data/google/model/minimal')

labels = ['No pairs', 'Pairs']
colors = ['0.75', '0.45']

# tables = ['../data/google/pairs/sets/tfidf_no_pairs_r-validation.npy', '../data/google/pairs/sets/tfidf_pairs_r-validation.npy']
# min, max = sp.calculate_min_max_from_table(tables)
# split = sp.calculate_split_from_table(tables, verbose=False, normalize=(min, max))
# tables = ['../data/google/pairs/sets/tfidf_no_pairs_r-test.npy', '../data/google/pairs/sets/tfidf_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tables, split, normalize=(min, max))
# sp.calculate_JS_from_table(tables, normalize=(min, max), verbose=True)
#
# tables = ['../data/google/pairs/sets/mean_no_pairs_r-validation.npy', '../data/google/pairs/sets/mean_pairs_r-validation.npy']
# min, max = sp.calculate_min_max_from_table(tables)
'''
Coding Just for Fun
Created by burness on 16/3/6.
'''
from data_utils import *
from w2v import w2v

random.seed(314)
data = StanfordSentiment()
tokens = data.tokens()
nWords = len(tokens)

dimVectors = 10  # dimensionality of the word vectors
C = 5            # context window size

word2vec_model = w2v(data, C=C)

random.seed(2016)
np.random.seed(2016)
# Stack randomly initialized input (center-word) vectors on top of
# zero-initialized output (context-word) vectors
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - .5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors0 = word2vec_model.sgd(
    lambda vec: word2vec_model.word2vec_sgd_wrapper(
        word2vec_model.cbow, tokens, vec, data, C,
        word2vec_model.softmax_cost_grad),
    wordVectors, 0.3, 2000, None, True, PRINT_EVERY=10)
# Sum the input and output halves to obtain the final word vectors
wordVectors = wordVectors0[:nWords, :] + wordVectors0[nWords:, :]

print "\n=== For autograder ==="
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print checkVecs
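# A possible sanity check (a sketch; the word choices are arbitrary but both
# appear in checkWords above, so they are known to be in the vocabulary):
#
#   v1 = wordVectors[tokens['movie'], :]
#   v2 = wordVectors[tokens['ordinary'], :]
#   print np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))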