Example #1
def cooc():
    """Computes GloVe cooccurrence matrix given a vocabulary and the pos. and neg. corpora.
    Entries in the cooccurrence matrix are weighted by the inverse of the distance of the two words.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('COOCCURRENCES')
    if reuse_computed and os.path.isfile(vocab_dir+cooc_file+'.pkl'):
        if verbose > 0:
            print('Reusing cooccurrence matrix:', cooc_file)
            print_header_str('DONE')
            print()
        return

    with open(vocab_dir+vocab_file+'.pkl', 'rb') as f:
        vocab = pickle.load(f)

    cooc_dict = dict()
    counter = 0

    tot = (count_file_lines(tweet_dir + emb_train_tweets_pos) +
            count_file_lines(tweet_dir + emb_train_tweets_neg) +
            count_file_lines(tweet_dir + emb_test_tweets))
    
    if verbose == 1:
        print_progress_bar(0, tot, prefix = 'Building cooccurrence matrix:', suffix = 'Complete')

    for fn in [tweet_dir + emb_train_tweets_pos, tweet_dir + emb_train_tweets_neg, tweet_dir + emb_test_tweets]:
        with open(fn) as f:
            for line in f:

                # keeps tokens that are not in vocab for proper window construction
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                
                n = len(tokens)
                for i in range(n):
                    # Symmetric window of emb_context_window words on each side;
                    # -1 marks out-of-vocabulary tokens, which are skipped here.
                    for j in range(max(0, i-emb_context_window), min(n, i+emb_context_window+1)):
                        if i != j and tokens[i] >= 0 and tokens[j] >= 0:
                            tok = (tokens[i], tokens[j])
                            cooc_dict[tok] = cooc_dict.get(tok, 0) + 1/abs(i-j)
                counter += 1
                if verbose == 1 and (counter % 5000 == 0 or counter == tot):
                    print_progress_bar(counter, tot, prefix = 'Building cooccurrence matrix:', suffix = 'Complete')
    
    data = list(cooc_dict.values())
    row = [k1 for k1,k2 in cooc_dict.keys()]
    col = [k2 for k1,k2 in cooc_dict.keys()]

    # Fix the shape explicitly so the matrix always covers the full vocabulary.
    cooc = coo_matrix((data, (row, col)), shape=(len(vocab), len(vocab)))

    with open(vocab_dir+cooc_file+'.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)
    if verbose > 0:
        print("{} nonzero entries.".format(cooc.nnz))
        print_header_str('DONE')
        print()
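
To make the window logic above concrete, here is a minimal self-contained sketch of the same distance-weighted accumulation on a toy corpus (the toy vocabulary and sentences are made up for illustration; it bypasses the module's config globals and file handling):

from scipy.sparse import coo_matrix

vocab = {'i': 0, 'love': 1, 'tweets': 2}
corpus = ['i love love tweets', 'tweets i really love']
window = 2                                   # plays the role of emb_context_window

counts = {}
for sentence in corpus:
    tokens = [vocab.get(t, -1) for t in sentence.split()]   # -1 = out of vocabulary
    n = len(tokens)
    for i in range(n):
        for j in range(max(0, i - window), min(n, i + window + 1)):
            if i != j and tokens[i] >= 0 and tokens[j] >= 0:
                pair = (tokens[i], tokens[j])
                counts[pair] = counts.get(pair, 0) + 1 / abs(i - j)

rows = [a for a, b in counts]
cols = [b for a, b in counts]
cooc = coo_matrix((list(counts.values()), (rows, cols)), shape=(len(vocab), len(vocab)))
print(cooc.toarray())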
Example #2
def word2vec():
    """Computes Word2vec embeddings, retrieving corpus from positive and negative tweet files.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :embedding_dim          - size of embeddings
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('WORD2VEC')

    if (reuse_computed 
        and os.path.isfile(embeddings_dir+selected_embeddings_file+'.npy')
        and os.path.isfile(vocab_dir+vocab_file+'.pkl')):
        if verbose > 0:
            print('Reusing word2vec vocab:', vocab_file)
            print('Reusing word2vec embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    dataset=[]

    for fn in [tweet_dir+emb_train_tweets_pos, tweet_dir+emb_train_tweets_neg,tweet_dir+emb_test_tweets]:
        with open(fn) as f:
            for line in f:
                tokens = line.strip().split()
                dataset.append(tokens)
    
    # Note: size/iter are gensim 3.x parameter names
    # (renamed to vector_size/epochs in gensim 4.x).
    model = Word2Vec(dataset,
                size=embedding_dim, window=emb_context_window,
                min_count=emb_word_min_count, workers=6,
                iter=embedding_epochs, sg=1, compute_loss=True)
    
    X = model.wv.vectors
    if embedding_norm:
        X = normalize_matrix(X)

    np.save(embeddings_dir+selected_embeddings_file, X)

    # Map each word to the row it occupies in model.wv.vectors. Use gensim's
    # stored index rather than dict iteration order, which need not match.
    vocab = {word: entry.index for word, entry in model.wv.vocab.items()}
        
    with open(vocab_dir+vocab_file+'.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print('Training loss:', model.get_latest_training_loss())
        print_header_str('DONE')
    print()
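
A rough sketch of how the two artifacts saved here would be consumed downstream; 'vocab.pkl' and 'embeddings.npy' are placeholders for vocab_dir + vocab_file + '.pkl' and embeddings_dir + selected_embeddings_file + '.npy', and the query word is assumed to have survived the min_count filter:

import pickle
import numpy as np

with open('vocab.pkl', 'rb') as f:          # vocab_dir + vocab_file + '.pkl'
    vocab = pickle.load(f)
embeddings = np.load('embeddings.npy')      # embeddings_dir + selected_embeddings_file + '.npy'

vec = embeddings[vocab['happy']]            # row lookup via the word -> index map
print(vec.shape)                            # (embedding_dim,)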
Example #3
def create_vocab():
    """Extracts GloVe vocabulary given the positive and negative corpora.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('VOCABULARY')
    if reuse_computed and os.path.isfile(vocab_dir + vocab_file + '.pkl'):
        if verbose > 0:
            print('Reusing vocabulary:', vocab_file)
            print_header_str('DONE')
            print()
        return

    vocab_filename = vocab_file + '.txt'
    vocab_filename_cut = vocab_file + '_cut.txt'

    # String format for SED argument (only keep words that appear at least min_count times)
    cut_str = '\\({}\\)'.format(''.join([
        str(i) + ('\\|' if i < emb_word_min_count - 1 else '')
        for i in range(0, emb_word_min_count)
    ]))
    command_create_vocab = "cat {} {} {} | sed \"s/ /\\n/g\" | grep -v \"^\\s*$\" | sort | uniq -c > {}".format(
        tweet_dir + emb_train_tweets_pos, tweet_dir + emb_train_tweets_neg,
        tweet_dir + emb_test_tweets, vocab_dir + vocab_filename)
    command_cut_vocab = "cat {} | sed \"s/^\\s\\+//g\" | sort -rn | grep -v \"^{}\\s\" | cut -d' ' -f2 > {}".format(
        vocab_dir + vocab_filename, cut_str, vocab_dir + vocab_filename_cut)

    utils.run_script(command_create_vocab)
    utils.run_script(command_cut_vocab)

    vocab = dict()
    with open(vocab_dir + vocab_filename_cut) as f:
        for idx, line in enumerate(f):
            vocab[line.strip()] = idx

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    utils.run_script(
        'rm', [vocab_dir + vocab_filename, vocab_dir + vocab_filename_cut])

    if verbose > 0:
        print('Size:', len(vocab), 'words')
        print_header_str('DONE')
        print()
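
The shell pipeline amounts to counting whitespace-separated tokens across the three corpora and keeping, in descending frequency order, those seen at least emb_word_min_count times; a pure-Python sketch of the same computation (the file names below are placeholders, and tie ordering may differ from sort -rn):

from collections import Counter

min_count = 5                                                # emb_word_min_count
counts = Counter()
for fn in ['train_pos.txt', 'train_neg.txt', 'test.txt']:    # placeholder corpora
    with open(fn) as f:
        for line in f:
            counts.update(line.split())

kept = [w for w, c in counts.most_common() if c >= min_count]
vocab = {w: idx for idx, w in enumerate(kept)}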
Example #4
def stanford_glove():
    """Computes Word2vec embeddings, retrieving corpus from positive and negative tweet files.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :embedding_dim          - size of embeddings
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('STANFORD GLOVE')

    if (reuse_computed and
            os.path.isfile(embeddings_dir + selected_embeddings_file + '.npy')
            and os.path.isfile(vocab_dir + vocab_file + '.pkl')):
        if verbose > 0:
            print('Reusing GloVe vocab:', vocab_file)
            print('Reusing GloVe embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    stanford_root_dir = embeddings_dir + '../StanfordGloVe/'

    with open(stanford_root_dir + 'run.sh', 'w') as frun:
        frun.write(f"""\
#!/bin/bash
set -e

pushd {stanford_root_dir}
make
popd

# Builds the GloVe toolkit and trains a GloVe model on the tweet corpora.

CORPUS="{tweet_dir+emb_train_tweets_pos} {tweet_dir+emb_train_tweets_neg} {tweet_dir+emb_test_tweets}"
VOCAB_FILE={stanford_root_dir+vocab_file}_cnt.txt
COOCCURRENCE_FILE={stanford_root_dir}cooccurrence.bin
COOCCURRENCE_SHUF_FILE={stanford_root_dir}cooccurrence.shuf.bin
BUILDDIR={stanford_root_dir}build
SAVE_FILE={stanford_root_dir+selected_embeddings_file}_tmp
VERBOSE=2
MEMORY=8.0
VOCAB_MIN_COUNT={emb_word_min_count}
VECTOR_SIZE={embedding_dim}
MAX_ITER={embedding_epochs}
WINDOW_SIZE={emb_context_window}
BINARY=2
NUM_THREADS=6
X_MAX=100

echo
echo "$ cat CORPUS | BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > VOCAB_FILE"
cat $CORPUS | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE > $VOCAB_FILE

echo "$ cat CORPUS | BUILDDIR/cooccur -memory $MEMORY -vocab-file VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > COOCCURRENCE_FILE"
cat $CORPUS | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE > $COOCCURRENCE_FILE

echo "$ BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < COOCCURRENCE_FILE > COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE

echo "$ BUILDDIR/glove -save-file SAVE_FILE -threads $NUM_THREADS -input-file COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE

rm $COOCCURRENCE_FILE $COOCCURRENCE_SHUF_FILE
        """)

    stanford_glove_cmd = 'chmod +x ' + stanford_root_dir + 'run.sh && '
    stanford_glove_cmd += stanford_root_dir + 'run.sh'
    run_script(stanford_glove_cmd)

    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt', 'r') as f:
        vocab_size = sum(1 for _ in f)
    vocab = {}
    embeddings = np.zeros((vocab_size, embedding_dim), dtype='float32')

    with open(stanford_root_dir + selected_embeddings_file + '_tmp.txt',
              'r') as f:
        for i, l in enumerate(f):
            ll = l.strip().split(' ')
            word, emb = ll[0].strip(), [float(x.strip()) for x in ll[1:]]

            vocab[word] = i
            embeddings[i] = np.array(emb)

    if embedding_norm:
        embeddings = normalize_matrix(embeddings)

    np.save(embeddings_dir + selected_embeddings_file, embeddings)

    with open(vocab_dir + vocab_file + '.pkl', 'wb') as f:
        pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

    cleanup_cmd = f'rm {stanford_root_dir+vocab_file}_cnt.txt {stanford_root_dir+selected_embeddings_file}_tmp.txt ; rm -rf {stanford_root_dir}build'
    run_script(cleanup_cmd)

    if verbose > 0:
        print('Vocabulary size:', len(vocab))
        print_header_str('DONE')
    print()
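
A possible sanity check on the saved GloVe artifacts (the paths are placeholders for the module's vocab and embedding files; the two query words are assumed to be in the vocabulary):

import pickle
import numpy as np

with open('vocab.pkl', 'rb') as f:           # vocab_dir + vocab_file + '.pkl'
    vocab = pickle.load(f)
emb = np.load('embeddings.npy')              # embeddings_dir + selected_embeddings_file + '.npy'

def cosine(a, b):
    # Cosine similarity of two in-vocabulary words.
    va, vb = emb[vocab[a]], emb[vocab[b]]
    return float(va @ vb / (np.linalg.norm(va) * np.linalg.norm(vb) + 1e-8))

print(cosine('good', 'great'))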
Example #5
def glove():
    """Computes GloVe embeddings given a vocabulary and a corresponding cooccurrence matrix.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :embedding_dim          - size of embeddings
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
        :glove_polarization     - polarization factor for embedding initialization (with rel. freq)
    """
    if verbose > 0:
        print_header_str('EMBEDDINGS')
    
    if reuse_computed and os.path.isfile(embeddings_dir+selected_embeddings_file+'.npy'):
        if verbose > 0:
            print('Reusing embeddings:', selected_embeddings_file)
            print_header_str('DONE')
            print()
        return

    if verbose > 0:
        print("Loading cooccurrence matrix.")

    with open(vocab_dir+cooc_file+'.pkl', 'rb') as f:
        cooc = pickle.load(f)

    nmax = 100
    
    if verbose > 0:
        print("\tUsing nmax =", nmax, ", with cooc.max() =", cooc.max(),end='\n\n')

        print("Initializing embeddings with U~[-.5,.5] distribution: ", 
            (cooc.shape[0], embedding_dim+1),
            (cooc.shape[1], embedding_dim+1), flush=True, end='\n\n')
    
    xs = np.random.uniform(size=(cooc.shape[0], embedding_dim+1)) - .5
    ys = np.random.uniform(size=(cooc.shape[1], embedding_dim+1)) - .5

    xs /= (embedding_dim+1)
    ys /= (embedding_dim+1)
    # Bias terms are folded into the vectors: ys' constant-1 column (index
    # embedding_dim-1) picks out xs' bias, and xs' constant-1 column (index
    # embedding_dim) picks out ys' bias in the dot product.
    xs[:,embedding_dim] = 1
    ys[:,embedding_dim-1] = 1
    
    if glove_polarization > 0:
        if verbose > 0:
            print('Adding polarization to random initial embeddings. Factor:', glove_polarization, end='\n\n')
        ### Get bias for positive and negative words ###
        vocab_pos = pickle.load(open(tweet_dir+emb_polar_vocab.format('pos'), 'rb'))
        vocab_neg = pickle.load(open(tweet_dir+emb_polar_vocab.format('neg'), 'rb'))
        polarization = sentiment_polarization(vocab_pos, vocab_neg)

        vocab = pickle.load(open(vocab_dir+vocab_file+'.pkl', 'rb'))

        ############### Add polarization ################
        split = (embedding_dim-1)//2
        for word,id in vocab.items():
            if word in polarization:
                polar = polarization[word]
            else:
                polar = .5
            xs[id,:split] += glove_polarization*polar / (embedding_dim+1)
            xs[id,split:embedding_dim-1] -= glove_polarization*(1-polar) / (embedding_dim+1)
            ys[id,:split] += glove_polarization*polar / (embedding_dim+1)
            ys[id,split:embedding_dim-1] -= glove_polarization*(1-polar) / (embedding_dim+1)
        #################################################
    
    eta = 0.05
    alpha = 3 / 4

    prev_loss = 0.0

    data = [(i, j, n) for i, j, n in zip(cooc.row, cooc.col, cooc.data)]

    # Initial pass: estimate the starting loss (one tentative, unapplied SGD step
    # per entry) so the first epoch's "loss difference" and the bold-driver rule
    # have a baseline.
    for ix, jy, n in data:
        w = min(1., (n/nmax)**alpha)
        x, y = xs[ix], ys[jy]
        increase_mul = 2*eta*w * (log(n) - np.dot(x, y))

        x_upd = xs[ix] + increase_mul*y
        y_upd = ys[jy] + increase_mul*x

        prev_loss += w * (log(n) - np.dot(x_upd, y_upd))**2
    
    for epoch in range(embedding_epochs):
        loss = 0.0
        random.shuffle(data)

        if verbose == 1:
            print_progress_bar(0,len(data), prefix='Epoch {:2d}/{:2d}:'.format(epoch+1,embedding_epochs),suffix='- loss difference {:8.2f}'.format(loss-prev_loss))
        counter,missed_updates=0,0
        for ix, jy, n in data:
            counter+=1
            w = min( 1., (n/nmax)**alpha )
            x,y = xs[ix], ys[jy]
            increase_mul = 2*eta*w * ( log(n) - np.dot(x, y) )

            x_upd = xs[ix] + increase_mul*y
            y_upd = ys[jy] + increase_mul*x

            loss_delta = w * ( log(n) - np.dot(x_upd, y_upd) )**2

            # Skip (do not apply) this update if it would overflow to NaN/inf;
            # the pre-update loss contribution is kept for this entry instead.
            if (np.isnan(x_upd).any() or np.isinf(x_upd).any() or
                    np.isnan(y_upd).any() or np.isinf(y_upd).any() or
                    np.isnan(loss+loss_delta) or np.isinf(loss+loss_delta)):
                missed_updates += 1
                loss += w * ( log(n) - np.dot(xs[ix], ys[jy]) )**2
                if (counter % 5000 == 0 or counter == len(data)) and verbose == 1:
                    print_progress_bar(counter,len(data), prefix='Epoch {:2d}/{:2d}:'.format(epoch+1,embedding_epochs),suffix='- loss difference {:8.2f}'.format(loss-prev_loss))
                continue

            xs[ix] = x_upd
            ys[jy] = y_upd

            # Reset bias
            xs[ix,embedding_dim] = 1
            ys[jy,embedding_dim-1] = 1

            loss += loss_delta
            if (counter % 50000 == 0 or counter == len(data)) and verbose == 1:
                print_progress_bar(counter,len(data), prefix='Epoch {:2d}/{:2d}:'.format(epoch+1,embedding_epochs),suffix='- loss difference {:8.2f}'.format(loss-prev_loss))

        ### BOLD DRIVER LEARNING RATE ###
        if prev_loss > loss or epoch==0:
            eta += 0.01*eta
        else:
            eta /= 2
        prev_loss = loss
        #################################

        if verbose > 0:
            print("Epoch {:2d} loss : {:10.2f}".format(epoch+1, loss))
            print('Missed {:4d} updates due to overflow prevention'.format(missed_updates))
            print('Current learning rate: {:1.3f}'.format(eta), end='\n\n', flush=True)

        if (epoch+1) % 10 == 0 and epoch+1 != embedding_epochs:
            X = xs[:,:embedding_dim]
            if embedding_norm:
                X = normalize_matrix(X)
            np.save(embeddings_dir+glove_embedding_file_suffix(epoch+1), X)
    
    # Note: the bias for xs is in position embedding_dim-1
    X = xs[:,:embedding_dim]
    if embedding_norm:
        X = normalize_matrix(X)
    np.save(embeddings_dir+glove_embedding_file_suffix(embedding_epochs), X)

    if verbose > 0:
        print_header_str('DONE')
        print()
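For reference, the training loop above is plain SGD on the standard GloVe weighted least-squares objective, with both bias terms folded into the extra column of each vector (so they are part of the dot product):

\[
J \;=\; \sum_{X_{ij} > 0} f(X_{ij})\,\bigl(x_i^\top y_j - \log X_{ij}\bigr)^2,
\qquad
f(x) \;=\; \min\!\bigl(1,\,(x/x_{\max})^{\alpha}\bigr),
\]

with x_max = nmax = 100 and alpha = 3/4. increase_mul in the code is the shared factor 2*eta*f(X_ij)*(log X_ij - x_i·y_j) of the two coordinate updates, and the bold-driver rule grows eta by 1% after an epoch that lowers the loss and halves it otherwise.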
def compute_dataset_from_embeddings(vocab=None, embeddings=None):
    """Creates matrix of tweet embeddings for baseline. 
    Sums embeddings of words in tweet (possibly weighting the sum with other metrics - entropy, salience...)
    # Configs
        :dataset_version    - choose preprocessing
        :emb_dataset        - choose full or small dataset
        :embedding_dim      - size of embeddings
        :misc, all other configurations are embedding-specific 
            (they have influence on the outcome, but not on the functioning of the module)
    """
    if verbose > 0:
        print_header_str('PREPARE DATASET')

    if vocab is None:
        with open(vocab_dir + vocab_file + '.pkl', 'rb') as f:
            vocab = pickle.load(f)
    if embeddings is None:
        embeddings = np.load(embeddings_dir + selected_embeddings_file +
                             '.npy',
                             allow_pickle=True)

    n_train = count_file_lines(tweet_dir +
                               cls_train_tweets_pos) + count_file_lines(
                                   tweet_dir + cls_train_tweets_neg)

    d = embeddings.shape[1]

    x_train = np.zeros((n_train, d), dtype='float32')
    y_train = np.zeros(n_train, dtype='float32')

    counter = 0

    if verbose == 1:
        print_progress_bar(0, n_train, prefix='Embedding training tweets:')

    for fn in [
            tweet_dir + cls_train_tweets_pos, tweet_dir + cls_train_tweets_neg
    ]:
        if 'tags' in fn:
            continue

        curr_file_class = 1 if 'pos' in fn else 0

        with open(fn) as f:
            if load_tags:
                fn_tag = os.path.splitext(fn)[0] + '_tags.txt'
                f_tag = open(fn_tag).readlines()
            for line_id, line in enumerate(f):
                line_tag = None
                line = line.strip().split()
                if load_tags:
                    line_tag = f_tag[line_id].strip().split()
                    line = [
                        tok + (tag if tag in pos_to_emb else '')
                        for tok, tag in zip(line, line_tag)
                    ]
                x_train[counter] = get_emb_sum(embeddings, vocab, line)
                y_train[counter] = curr_file_class
                counter += 1
                if verbose == 1 and (counter % 5000 == 0
                                     or counter == n_train):
                    print_progress_bar(counter,
                                       n_train,
                                       prefix='Embedding training tweets:')

    n_test = count_file_lines(tweet_dir + test_tweets)
    x_test = np.zeros((n_test, d), dtype='float32')
    counter = 0

    if verbose == 1:
        print_progress_bar(0, n_test, prefix='Embedding test tweets:    ')
    for fn in [tweet_dir + test_tweets]:
        with open(fn) as f:
            if load_tags:
                fn_tag = os.path.splitext(fn)[0] + '_tags.txt'
                f_tag = open(fn_tag).readlines()
            for line_id, line in enumerate(f):
                line = (''.join(line.split(',')[1:])).strip().split()
                line_tag = None
                if load_tags:
                    line_tag = (''.join(
                        f_tag[line_id].split(',')[1:])).strip().split()
                    line = [
                        tok + (tag if tag in pos_to_emb else '')
                        for tok, tag in zip(line, line_tag)
                    ]
                x_test[counter] = get_emb_sum(embeddings, vocab, line)

                counter += 1
                if verbose == 1 and (counter % 1000 == 0 or counter == n_test):
                    print_progress_bar(counter,
                                       n_test,
                                       prefix='Embedding test tweets:    ')

    if verbose > 0:
        print_header_str('DONE')
        print()

    return x_train, y_train, x_test
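
The returned arrays are meant to feed a baseline classifier; one plausible use (scikit-learn is an assumption here, not an import of this module):

from sklearn.linear_model import LogisticRegression

x_train, y_train, x_test = compute_dataset_from_embeddings()

clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)                  # 1 = positive tweet, 0 = negative tweet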