def cooc():
    """Computes the GloVe cooccurrence matrix given a vocabulary and the
    positive and negative corpora.

    Entries in the cooccurrence matrix are weighted by the inverse of the
    distance between the two words.

    # Configs
    :dataset_version     - choose preprocessing
    :emb_dataset         - choose full or small dataset
    :emb_context_window  - context window size
    :emb_word_min_count  - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('COOCCURRENCES')

    if reuse_computed and os.path.isfile(vocab_dir + cooc_file + '.pkl'):
        if verbose > 0:
            print('Reusing cooccurrence matrix:', cooc_file)
            print_header_str('DONE')
            print()
        return

    with open(vocab_dir + vocab_file + '.pkl', 'rb') as f:
        vocab = pickle.load(f)

    cooc_dict = dict()

    counter = 0
    tot = (count_file_lines(tweet_dir + emb_train_tweets_pos)
           + count_file_lines(tweet_dir + emb_train_tweets_neg)
           + count_file_lines(tweet_dir + emb_test_tweets))

    if verbose == 1:
        print_progress_bar(0, tot, prefix='Building cooccurrence matrix:', suffix='Complete')

    for fn in [tweet_dir + emb_train_tweets_pos,
               tweet_dir + emb_train_tweets_neg,
               tweet_dir + emb_test_tweets]:
        with open(fn) as f:
            for line in f:
                # Keep out-of-vocab tokens (mapped to -1) so window distances stay correct.
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                n = len(tokens)
                for i in range(n):
                    # Symmetric window of emb_context_window words on each side.
                    for j in range(max(0, i - emb_context_window),
                                   min(n, i + emb_context_window + 1)):
                        # Index 0 is a valid vocab entry; only -1 marks out-of-vocab tokens.
                        if i != j and tokens[i] >= 0 and tokens[j] >= 0:
                            tok = (tokens[i], tokens[j])
                            # Weight each cooccurrence by the inverse distance of the pair.
                            cooc_dict[tok] = cooc_dict.get(tok, 0) + 1 / abs(i - j)

                counter += 1
                if verbose == 1 and (counter % 5000 == 0 or counter == tot):
                    print_progress_bar(counter, tot, prefix='Building cooccurrence matrix:', suffix='Complete')

    data = list(cooc_dict.values())
    row = [k1 for k1, k2 in cooc_dict.keys()]
    col = [k2 for k1, k2 in cooc_dict.keys()]

    # Fix the shape explicitly so the matrix covers the whole vocabulary even if
    # the highest-index words never cooccur.
    cooc = coo_matrix((data, (row, col)), shape=(len(vocab), len(vocab)))

    with open(vocab_dir + cooc_file + '.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)

    if verbose > 0:
        print("{} nonzero entries.".format(cooc.nnz))
        print_header_str('DONE')
        print()

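# Illustrative sketch (not used by the pipeline): how the inverse-distance
# weighting in cooc() accumulates over a toy window. The token ids and window
# size below are made up for the example.
def _cooc_weight_example():
    tokens = [0, 1, 0, 2]   # vocab indices of a 4-token tweet
    window = 2
    weights = {}
    for i in range(len(tokens)):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if i != j:
                pair = (tokens[i], tokens[j])
                weights[pair] = weights.get(pair, 0) + 1 / abs(i - j)
    # The pair (0, 1) is seen twice at distance 1, so weights[(0, 1)] == 2.0,
    # while (1, 2), seen once at distance 2, only gets 0.5.
    return weights
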
def word2vec():
    """Computes Word2vec embeddings, retrieving the corpus from the positive
    and negative tweet files.

    # Configs
    :dataset_version     - choose preprocessing
    :emb_dataset         - choose full or small dataset
    :embedding_dim       - size of embeddings
    :emb_context_window  - context window size
    :emb_word_min_count  - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('WORD2VEC')

    if (reuse_computed
            and os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy')
            and os.path.isfile(vocab_dir + vocab_file + '.pkl')):
        if verbose > 0:
            print('Reusing word2vec vocab:', vocab_file)
            print('Reusing word2vec embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    dataset = []
    for fn in [tweet_dir + emb_train_tweets_pos,
               tweet_dir + emb_train_tweets_neg,
               tweet_dir + emb_test_tweets]:
        with open(fn) as f:
            for line in f:
                tokens = line.strip().split()
                dataset.append(tokens)

    # gensim 3.x API; sg=1 selects the skip-gram model.
    model = Word2Vec(dataset,
                     size=embedding_dim,
                     window=emb_context_window,
                     min_count=emb_word_min_count,
                     workers=6,
                     iter=embedding_epochs,
                     sg=1,
                     compute_loss=True)

    X = model.wv.vectors
    if embedding_norm:
        X = normalize_matrix(X)
    np.save(embeddings_dir + selected_embeddings_file, X)

    # Map each word to its row in model.wv.vectors via the index stored by
    # gensim, so the mapping stays correct regardless of dict iteration order.
    vocab = {word: voc.index for word, voc in model.wv.vocab.items()}

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print('Training loss:', model.get_latest_training_loss())
        print_header_str('DONE')
        print()

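# Note: the keyword names used above (size=, iter=) and model.wv.vocab follow
# the gensim 3.x API. Under gensim >= 4.0 the equivalent call would be roughly
# (untested sketch):
#   Word2Vec(dataset, vector_size=embedding_dim, window=emb_context_window,
#            min_count=emb_word_min_count, workers=6, epochs=embedding_epochs,
#            sg=1, compute_loss=True)
# and the word-to-row mapping would come from model.wv.key_to_index.
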
def create_vocab():
    """Extracts the GloVe vocabulary given the positive and negative corpora.

    # Configs
    :dataset_version     - choose preprocessing
    :emb_dataset         - choose full or small dataset
    :emb_word_min_count  - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('VOCABULARY')

    if reuse_computed and os.path.isfile(vocab_dir + vocab_file + '.pkl'):
        if verbose > 0:
            print('Reusing vocabulary:', vocab_file)
            print_header_str('DONE')
            print()
        return

    vocab_filename = vocab_file + '.txt'
    vocab_filename_cut = vocab_file + '_cut.txt'

    # grep pattern matching counts below emb_word_min_count, used to drop rare
    # words (e.g. "\(0\|1\|2\|3\|4\)" for emb_word_min_count == 5).
    cut_str = '\\({}\\)'.format(''.join([
        str(i) + ('\\|' if i < emb_word_min_count - 1 else '')
        for i in range(0, emb_word_min_count)
    ]))

    # Count word occurrences over all corpora: one "count word" pair per line.
    command_create_vocab = "cat {} {} {} | sed \"s/ /\\n/g\" | grep -v \"^\\s*$\" | sort | uniq -c > {}".format(
        tweet_dir + emb_train_tweets_pos, tweet_dir + emb_train_tweets_neg,
        tweet_dir + emb_test_tweets, vocab_dir + vocab_filename)
    # Keep only words appearing at least emb_word_min_count times, most frequent first.
    command_cut_vocab = "cat {} | sed \"s/^\\s\\+//g\" | sort -rn | grep -v \"^{}\\s\" | cut -d' ' -f2 > {}".format(
        vocab_dir + vocab_filename, cut_str, vocab_dir + vocab_filename_cut)

    utils.run_script(command_create_vocab)
    utils.run_script(command_cut_vocab)

    vocab = dict()
    with open(vocab_dir + vocab_filename_cut) as f:
        for idx, line in enumerate(f):
            vocab[line.strip()] = idx

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    utils.run_script(
        'rm', [vocab_dir + vocab_filename, vocab_dir + vocab_filename_cut])

    if verbose > 0:
        print('Size:', len(vocab), 'words')
        print_header_str('DONE')
        print()

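# Equivalent pure-Python sketch of the shell pipeline in create_vocab()
# (illustrative only, not used by the pipeline): count the tokens over the
# corpora and keep those appearing at least min_count times, most frequent first.
def _create_vocab_python_sketch(corpus_paths, min_count):
    from collections import Counter
    counts = Counter()
    for path in corpus_paths:
        with open(path) as f:
            for line in f:
                counts.update(line.strip().split())
    kept = [w for w, c in counts.most_common() if c >= min_count]
    return {word: idx for idx, word in enumerate(kept)}
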
def stanford_glove():
    """Computes GloVe embeddings with the Stanford GloVe implementation,
    retrieving the corpus from the positive and negative tweet files.

    # Configs
    :dataset_version     - choose preprocessing
    :emb_dataset         - choose full or small dataset
    :embedding_dim       - size of embeddings
    :emb_context_window  - context window size
    :emb_word_min_count  - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('STANFORD GLOVE')

    if (reuse_computed
            and os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy')
            and os.path.isfile(vocab_dir + vocab_file + '.pkl')):
        if verbose > 0:
            print('Reusing GloVe vocab:', vocab_file)
            print('Reusing GloVe embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    stanford_root_dir = embeddings_dir + '../StanfordGloVe/'

    with open(stanford_root_dir + 'run.sh', 'w') as frun:
        frun.write(f"""\
#!/bin/bash
set -e

pushd {stanford_root_dir}
make
popd

# Builds the GloVe binaries, counts the vocabulary, builds and shuffles the
# cooccurrence file, and trains the embeddings.

CORPUS="{tweet_dir+emb_train_tweets_pos} {tweet_dir+emb_train_tweets_neg} {tweet_dir+emb_test_tweets}"
VOCAB_FILE={stanford_root_dir+vocab_file}_cnt.txt
COOCCURRENCE_FILE={stanford_root_dir}cooccurrence.bin
COOCCURRENCE_SHUF_FILE={stanford_root_dir}cooccurrence.shuf.bin
BUILDDIR={stanford_root_dir}build
SAVE_FILE={stanford_root_dir+selected_embeddings_file}_tmp
VERBOSE=2
MEMORY=8.0
VOCAB_MIN_COUNT={emb_word_min_count}
VECTOR_SIZE={embedding_dim}
MAX_ITER={embedding_epochs}
WINDOW_SIZE={emb_context_window}
BINARY=2
NUM_THREADS=6
X_MAX=100

echo
echo "$ cat CORPUS | BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > VOCAB_FILE"
cat $CORPUS | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > $VOCAB_FILE

echo "$ cat CORPUS | BUILDDIR/cooccur -memory $MEMORY -vocab-file VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > COOCCURRENCE_FILE"
cat $CORPUS | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > $COOCCURRENCE_FILE

echo "$ BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < COOCCURRENCE_FILE > COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE

echo "$ BUILDDIR/glove -save-file SAVE_FILE -threads $NUM_THREADS -input-file COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE

rm $COOCCURRENCE_FILE $COOCCURRENCE_SHUF_FILE
""")

    stanford_glove_cmd = 'chmod +x ' + stanford_root_dir + 'run.sh && '
    stanford_glove_cmd += stanford_root_dir + 'run.sh'
    run_script(stanford_glove_cmd)

    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt', 'r') as f:
        vocab_size = sum(1 for _ in f)

    vocab = {}
    embeddings = np.zeros((vocab_size, embedding_dim), dtype='float32')

    # Each line of the text output is "<word> <v_1> ... <v_dim>".
    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt', 'r') as f:
        for i, l in enumerate(f):
            ll = l.strip().split(' ')
            word, emb = ll[0].strip(), [float(x.strip()) for x in ll[1:]]
            vocab[word] = i
            embeddings[i] = np.array(emb)

    if embedding_norm:
        embeddings = normalize_matrix(embeddings)

    np.save(embeddings_dir + selected_embeddings_file, embeddings)
    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    cleanup_cmd = f'rm {stanford_root_dir+vocab_file}_cnt.txt {stanford_root_dir+selected_embeddings_file}_tmp.txt ; rm -rf {stanford_root_dir}build'
    run_script(cleanup_cmd)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print_header_str('DONE')
        print()

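# Typical downstream use of the artifacts written by the functions above
# (sketch; paths follow the module-level config used throughout this file):
#   embeddings = np.load(embeddings_dir + selected_embeddings_file + '.npy')
#   with open(vocab_dir + vocab_file + '.pkl', 'rb') as f:
#       vocab = pickle.load(f)
#   vec = embeddings[vocab[word]]   # embedding of a word present in the vocab
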
def glove():
    """Computes GloVe embeddings given a vocabulary and a corresponding
    cooccurrence matrix.

    # Configs
    :dataset_version     - choose preprocessing
    :emb_dataset         - choose full or small dataset
    :embedding_dim       - size of embeddings
    :emb_context_window  - context window size
    :emb_word_min_count  - minimum word count for a word to appear in vocab
    :glove_polarization  - polarization factor for embedding initialization (with rel. freq.)
    """
    if verbose > 0:
        print_header_str('EMBEDDINGS')

    if reuse_computed and os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy'):
        if verbose > 0:
            print('Reusing embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    if verbose > 0:
        print("Loading cooccurrence matrix.")

    with open(vocab_dir + cooc_file + '.pkl', 'rb') as f:
        cooc = pickle.load(f)

    nmax = 100
    if verbose > 0:
        print("\tUsing nmax =", nmax, ", with cooc.max() =", cooc.max(), end='\n\n')
        print("Initializing embeddings with U~[-.5,.5] distribution: ",
              (cooc.shape[0], embedding_dim + 1),
              (cooc.shape[1], embedding_dim + 1),
              flush=True, end='\n\n')

    xs = np.random.uniform(size=(cooc.shape[0], embedding_dim + 1)) - .5
    ys = np.random.uniform(size=(cooc.shape[1], embedding_dim + 1)) - .5
    xs /= (embedding_dim + 1)
    ys /= (embedding_dim + 1)

    # Bias terms are incorporated in the embeddings: the last column of xs and
    # column embedding_dim-1 of ys are fixed to 1, so the matching columns of
    # the other factor act as bias terms in the dot product.
    xs[:, embedding_dim] = 1
    ys[:, embedding_dim - 1] = 1

    if glove_polarization > 0:
        if verbose > 0:
            print('Adding polarization to random initial embeddings. Factor:',
                  glove_polarization, end='\n\n')

        ### Get bias for positive and negative words ###
        vocab_pos = pickle.load(open(tweet_dir + emb_polar_vocab.format('pos'), 'rb'))
        vocab_neg = pickle.load(open(tweet_dir + emb_polar_vocab.format('neg'), 'rb'))

        polarization = sentiment_polarization(vocab_pos, vocab_neg)

        vocab = pickle.load(open(vocab_dir + vocab_file + '.pkl', 'rb'))

        ############### Add polarization ################
        split = (embedding_dim - 1) // 2
        for word, idx in vocab.items():
            polar = polarization[word] if word in polarization else .5

            xs[idx, :split] += glove_polarization * polar / (embedding_dim + 1)
            xs[idx, split:embedding_dim - 1] -= glove_polarization * (1 - polar) / (embedding_dim + 1)

            ys[idx, :split] += glove_polarization * polar / (embedding_dim + 1)
            ys[idx, split:embedding_dim - 1] -= glove_polarization * (1 - polar) / (embedding_dim + 1)
        #################################################

    eta = 0.05
    alpha = 3 / 4

    # Estimate the loss after one (unapplied) round of updates; used as the
    # baseline for the bold-driver rule and the progress-bar loss difference.
    prev_loss = 0.0
    data = [(i, j, n) for i, j, n in zip(cooc.row, cooc.col, cooc.data)]
    for ix, jy, n in data:
        w = min(1., (n / nmax) ** alpha)
        x, y = xs[ix], ys[jy]
        increase_mul = 2 * eta * w * (log(n) - np.dot(x, y))
        x_upd = xs[ix] + increase_mul * y
        y_upd = ys[jy] + increase_mul * x
        prev_loss += w * (log(n) - np.dot(x_upd, y_upd)) ** 2

    for epoch in range(embedding_epochs):
        loss = 0.0
        random.shuffle(data)

        if verbose == 1:
            print_progress_bar(0, len(data),
                               prefix='Epoch {:2d}/{:2d}:'.format(epoch + 1, embedding_epochs),
                               suffix='- loss difference {:8.2f}'.format(loss - prev_loss))

        counter, missed_updates = 0, 0
        for ix, jy, n in data:
            counter += 1
            w = min(1., (n / nmax) ** alpha)
            x, y = xs[ix], ys[jy]
            increase_mul = 2 * eta * w * (log(n) - np.dot(x, y))

            x_upd = xs[ix] + increase_mul * y
            y_upd = ys[jy] + increase_mul * x

            loss_delta = w * (log(n) - np.dot(x_upd, y_upd)) ** 2

            # Skip the current update if it would produce NaNs or overflow.
            if (np.isnan(x_upd).any() or np.isinf(x_upd).any()
                    or np.isnan(y_upd).any() or np.isinf(y_upd).any()
                    or np.isnan(loss + loss_delta) or np.isinf(loss + loss_delta)):
                missed_updates += 1
                loss += w * (log(n) - np.dot(xs[ix], ys[jy])) ** 2
                if (counter % 5000 == 0 or counter == len(data)) and verbose == 1:
                    print_progress_bar(counter, len(data),
                                       prefix='Epoch {:2d}/{:2d}:'.format(epoch + 1, embedding_epochs),
                                       suffix='- loss difference {:8.2f}'.format(loss - prev_loss))
                continue

            xs[ix] = x_upd
            ys[jy] = y_upd

            # Reset the bias columns.
            xs[ix, embedding_dim] = 1
            ys[jy, embedding_dim - 1] = 1

            loss += loss_delta

            if (counter % 50000 == 0 or counter == len(data)) and verbose == 1:
                print_progress_bar(counter, len(data),
                                   prefix='Epoch {:2d}/{:2d}:'.format(epoch + 1, embedding_epochs),
                                   suffix='- loss difference {:8.2f}'.format(loss - prev_loss))

        ### BOLD DRIVER LEARNING RATE ###
        if prev_loss > loss or epoch == 0:
            eta += 0.01 * eta
        else:
            eta /= 2
        prev_loss = loss
        #################################

        if verbose > 0:
            print("Epoch {:2d} loss : {:10.2f}".format(epoch + 1, loss))
            print('Missed {:4d} updates due to overflow prevention'.format(missed_updates))
            print('Current learning rate: {:1.3f}'.format(eta), end='\n\n', flush=True)

        # Periodic checkpoint of the embeddings.
        if (epoch + 1) % 10 == 0 and epoch + 1 != embedding_epochs:
            X = xs[:, :embedding_dim]
            if embedding_norm:
                X = normalize_matrix(X)
            np.save(embeddings_dir + glove_embedding_file_suffix(epoch + 1), X)

    # Note: the bias for xs sits in column embedding_dim-1 and is kept in the
    # saved matrix.
    X = xs[:, :embedding_dim]
    if embedding_norm:
        X = normalize_matrix(X)
    np.save(embeddings_dir + glove_embedding_file_suffix(embedding_epochs), X)

    if verbose > 0:
        print_header_str('DONE')
        print()

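# Minimal sketch of the per-pair objective and SGD step used in glove() above,
# for a single cooccurrence entry (illustrative only; assumes x and y already
# carry the bias columns described in glove()):
def _glove_pair_step_sketch(x, y, n, eta=0.05, nmax=100, alpha=0.75):
    w = min(1., (n / nmax) ** alpha)      # cooccurrence weight f(n), capped at 1
    err = log(n) - np.dot(x, y)           # residual of the log-cooccurrence fit
    loss = w * err ** 2                   # per-pair weighted squared error
    x_new = x + 2 * eta * w * err * y     # gradient step on x
    y_new = y + 2 * eta * w * err * x     # gradient step on y
    return x_new, y_new, loss
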
def compute_dataset_from_embeddings(vocab=None, embeddings=None):
    """Creates the matrix of tweet embeddings for the baseline.

    Sums the embeddings of the words in each tweet (possibly weighting the sum
    with other metrics - entropy, salience, ...).

    # Configs
    :dataset_version  - choose preprocessing
    :emb_dataset      - choose full or small dataset
    :embedding_dim    - size of embeddings
    :misc, all other configurations are embedding-specific (they influence the
           outcome, but not the functioning of this module)
    """
    if verbose > 0:
        print_header_str('PREPARE DATASET')

    if vocab is None:
        with open(vocab_dir + vocab_file + '.pkl', 'rb') as f:
            vocab = pickle.load(f)
    if embeddings is None:
        embeddings = np.load(embeddings_dir + selected_embeddings_file + '.npy',
                             allow_pickle=True)

    n_train = count_file_lines(tweet_dir + cls_train_tweets_pos) + count_file_lines(
        tweet_dir + cls_train_tweets_neg)
    d = embeddings.shape[1]

    x_train = np.zeros((n_train, d), dtype='float32')
    y_train = np.zeros(n_train, dtype='float32')

    counter = 0
    if verbose == 1:
        print_progress_bar(0, n_train, prefix='Embedding training tweets:')

    for fn in [tweet_dir + cls_train_tweets_pos, tweet_dir + cls_train_tweets_neg]:
        if 'tags' in fn:
            continue
        curr_file_class = 1 if 'pos' in fn else 0

        with open(fn) as f:
            if load_tags:
                fn_tag = os.path.splitext(fn)[0] + '_tags.txt'
                f_tag = open(fn_tag).readlines()

            for line_id, line in enumerate(f):
                line_tag = None
                line = line.strip().split()
                if load_tags:
                    line_tag = f_tag[line_id].strip().split()
                    # Append the POS tag to each token when the tag has its own embedding.
                    line = [
                        tok + (tag if tag in pos_to_emb else '')
                        for tok, tag in zip(line, line_tag)
                    ]

                x_train[counter] = get_emb_sum(embeddings, vocab, line)
                y_train[counter] = curr_file_class

                counter += 1
                if verbose == 1 and (counter % 5000 == 0 or counter == n_train):
                    print_progress_bar(counter, n_train, prefix='Embedding training tweets:')

    n_test = count_file_lines(tweet_dir + test_tweets)
    x_test = np.zeros((n_test, d), dtype='float32')

    counter = 0
    if verbose == 1:
        print_progress_bar(0, n_test, prefix='Embedding test tweets: ')

    for fn in [tweet_dir + test_tweets]:
        with open(fn) as f:
            if load_tags:
                fn_tag = os.path.splitext(fn)[0] + '_tags.txt'
                f_tag = open(fn_tag).readlines()

            for line_id, line in enumerate(f):
                # Test lines are "<id>,<tweet>"; drop the id before tokenizing.
                line = (''.join(line.split(',')[1:])).strip().split()
                line_tag = None
                if load_tags:
                    line_tag = (''.join(f_tag[line_id].split(',')[1:])).strip().split()
                    line = [
                        tok + (tag if tag in pos_to_emb else '')
                        for tok, tag in zip(line, line_tag)
                    ]

                x_test[counter] = get_emb_sum(embeddings, vocab, line)

                counter += 1
                if verbose == 1 and (counter % 1000 == 0 or counter == n_test):
                    print_progress_bar(counter, n_test, prefix='Embedding test tweets: ')

    if verbose > 0:
        print_header_str('DONE')
        print()

    return x_train, y_train, x_test

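# Assumed behaviour of the get_emb_sum() helper used above (sketch only; the
# actual helper may apply extra weighting such as entropy or salience, as the
# docstring of compute_dataset_from_embeddings() mentions):
def _emb_sum_sketch(embeddings, vocab, tokens):
    vec = np.zeros(embeddings.shape[1], dtype='float32')
    for tok in tokens:
        idx = vocab.get(tok, -1)
        if idx >= 0:
            vec += embeddings[idx]   # sum the embeddings of in-vocab tokens
    return vec
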