def translate(original_w2v='original_w2v', new_w2v='new_w2v', from_docfreqs='from.npy', to_docfreqs='to.npy'):
    wn = w2v()
    wn.load_minimal(new_w2v)
    n_words = len(wn.model.syn0)
    to_d = np.zeros(n_words)

    wo = w2v()
    wo.load_minimal(original_w2v)

    f = open(from_docfreqs, 'rb')
    from_d = np.load(f)
    f.close()

    #translate
    print 'Begin translation...'
    success = 0
    index = 0
    for word in wn.model.vocab:
        i = wn.model.vocab[word].index
        if wo.exists_word(word):
            j = wo.model.vocab[word].index
            to_d[i] = from_d[j]
            success += 1
        else:
            to_d[i] = 1.0
    print 'Success rate: ' + str(float(success)/len(wn.model.vocab))

    print 'Saving...'
    f = open(to_docfreqs, 'wb')
    np.save(f, to_d)
    f.close()
    print 'Done.'
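A minimal call sketch for the function above, using its default arguments; it assumes numpy (`np`) and the project's `w2v` class are importable as in the other examples on this page.

# Remap document frequencies from the original model's vocabulary onto the
# vocabulary of a newly trained minimal model; words missing from the old
# vocabulary fall back to a frequency of 1.0.
translate(original_w2v='original_w2v',   # minimal model whose docfreqs are known
          new_w2v='new_w2v',             # newly trained minimal model
          from_docfreqs='from.npy',      # docfreqs indexed by the old vocabulary
          to_docfreqs='to.npy')          # output, indexed by the new vocabulary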
    def __init__(self,
                 pairs_filename='pairs.txt',
                 no_pairs_filename='no_pairs.txt',
                 docfreq_filename='docfreqs.npy',
                 w2v_filename='minimal',
                 no_words=20,
                 embedding_dim=400,
                 batch_size=100):

        super(PairProcessor, self).__init__()

        self.pairs_filename = pairs_filename
        self.no_pairs_filename = no_pairs_filename
        self.batch_size = batch_size
        self.no_words = no_words
        self.embedding_dim = embedding_dim
        self.docfreq_filename = docfreq_filename
        self.w2v_filename = w2v_filename

        self.x1 = np.zeros((batch_size, embedding_dim, no_words),
                           dtype=theano.config.floatX)
        self.x2 = np.zeros((batch_size, embedding_dim, no_words),
                           dtype=theano.config.floatX)
        self.y = np.zeros((batch_size), dtype=theano.config.floatX)
        self.z = np.zeros((batch_size), dtype=theano.config.floatX)

        f = open(self.docfreq_filename, 'rb')
        self.docfreqs = np.load(f)
        f.close()

        if isinstance(w2v_filename, basestring):
            self.w = w2v()
            self.w.load_minimal(self.w2v_filename)
        else:
            self.w = w2v_filename
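A construction sketch for the initializer above, using its default arguments as placeholders; the rest of the PairProcessor interface is not shown in this excerpt.

pp = PairProcessor(pairs_filename='pairs.txt',
                   no_pairs_filename='no_pairs.txt',
                   docfreq_filename='docfreqs.npy',
                   w2v_filename='minimal',   # a path, or an already loaded w2v instance
                   no_words=20,
                   embedding_dim=400,
                   batch_size=100)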
Example #4
    def Create_Vectorizer(self, name, k, cat):

        if (name == 'CountVec'):
            return CountVectorizer(
                analyzer="word",
                stop_words=nltk.corpus.stopwords.words('portuguese'),
                max_features=5000)
        elif (name == 'NGram'):
            return CountVectorizer(analyzer="char",
                                   ngram_range=([3, 16]),
                                   tokenizer=None,
                                   preprocessor=None,
                                   max_features=3000)
        elif (name == 'TFidf'):
            return TfidfVectorizer(
                min_df=2, stop_words=nltk.corpus.stopwords.words('portuguese'))

        elif (name == 'selecao'):
            return Selecao(k, cat)

        elif (name == 'w2v'):
            return w2v()

        elif (name == 'w2v_mean'):
            return w2v_mean()
        else:
            raise NameError('Vectorizer not found')
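A hypothetical call, assuming `proc` is an instance of the (unshown) class that defines Create_Vectorizer and `documents` is a list of raw Portuguese strings.

vectorizer = proc.Create_Vectorizer('TFidf', k=None, cat=None)  # k and cat are only used by 'selecao'
X = vectorizer.fit_transform(documents)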
def calculate(corpus='enwiki.txt', w2v_minimal_model='minimal', output_file='docfreq.npy'):
    w = w2v()
    w.load_minimal(w2v_minimal_model)
    n_words = len(w.model.vocab)
    freqs = np.zeros(n_words)

    i = 0
    with open(corpus) as f:
        for line in f:
            if i % 10000 == 0:
                print "Processing line " + str(i) + "..."
            i += 1
            words = set(line.split())
            for word in words:
                if w.exists_word(word):
                    freqs[w.model.vocab[word].index] += 1.0

    print 'Saving...'
    f = open(output_file, 'wb')
    np.save(f, freqs)
    f.close()
    print 'Done.'
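A call sketch with the defaults above; each corpus line is treated as one document, so a word's frequency is the number of lines it occurs in.

calculate(corpus='enwiki.txt',
          w2v_minimal_model='minimal',
          output_file='docfreq.npy')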
def extract_random(max_words_per_text=30,
                   skip=2,
                   n_pairs=5000000,
                   corpus='../data/enwiki.txt',
                   w2v_model='../data/model/minimal',
                   pairs_file='../data/pairs/enwiki_pairs_r.txt',
                   no_pairs_file='../data/pairs/enwiki_no_pairs_r.txt'):

    w = w2v()
    print 'Loading model...'
    w.load_minimal(w2v_model)
    print 'Done.'
    o = open(pairs_file, 'w')
    n = open(no_pairs_file, 'w')

    pool = []
    add1 = 0
    add2 = 10

    current_pair = 0.0

    f = open(corpus, 'r')
    for line in f:
        words = line.split() #get words of current paragraph

        words_per_text_1 = randint(10, max_words_per_text)
        words_per_text_2 = randint(10, max_words_per_text)

        if len(words) >= words_per_text_1 + words_per_text_2 + (words_per_text_1 + words_per_text_2)/4: #no of words needs to be sufficiently high
            pair1 = []
            pair2 = []
            while len(pair1) < words_per_text_1 and len(words) > 0: #add words to first part of pair
                current = words.pop(0)
                if w.exists_word(current):
                    pair1.append(current)
            for s in xrange(skip):
                if len(words) > 0:
                    words.pop(0)
            while len(pair2) < words_per_text_2 and len(words) > 0: #add words to second part of pair
                current = words.pop(0)
                if w.exists_word(current):
                    pair2.append(current)
            if len(pair1) == words_per_text_1 and len(pair2) == words_per_text_2:
                if add1 == 0: #add pair1 to the pool
                    pool.append(pair1)
                if add2 == 0: #add pair2 to the pool
                    pool.append(pair2)
                add1 = (add1 + 1) % 20
                add2 = (add2 + 1) % 20
                o.write(' '.join(pair1) + ';' + ' '.join(pair2) + '\n') #write pairs to output file
                current_pair += 1.0

        if len(pool) >= 100: #process 'no pairs'
            print 'Progress %.3f%%' % (100.0*current_pair / n_pairs)
            for i in xrange(10):
                shuffle(pool)
                for p in xrange(len(pool)-1):
                    n.write(' '.join(pool[p]) + ';' + ' '.join(pool[p+1]) + '\n') #write pairs to output file
            pool = []
            o.flush()
            n.flush()
            if current_pair >= n_pairs:
                break

    f.close()
    o.close()
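A call sketch with the defaults above; adjacent text fragments are written to pairs_file as semicolon-separated lines, and shuffled cross-paragraph combinations go to no_pairs_file.

extract_random(max_words_per_text=30,
               skip=2,
               n_pairs=5000000,
               corpus='../data/enwiki.txt',
               w2v_model='../data/model/minimal',
               pairs_file='../data/pairs/enwiki_pairs_r.txt',
               no_pairs_file='../data/pairs/enwiki_no_pairs_r.txt')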
        if np.max(d) > max:
            max = np.max(d)
        if np.min(d) < min:
            min = np.min(d)
    return min, max


if __name__ == '__main__':
    f = open('../data/wiki/model/docfreq.npy', 'rb')
    docfreqs = np.load(f)
    f.close()

    f = open('../data/tweets/model/idf_weights.npy', 'rb')
    idf_weights = np.load(f)
    f.close()

    w = w2v()
    w.load_minimal('../data/wiki/model/minimal')

    texts = [
        '../data/wiki/pairs/enwiki_no_pairs_10.txt',
        '../data/wiki/pairs/enwiki_pairs_10.txt'
    ]
    labels = ['Pairs', 'No pairs']
    colors = ['0.75', '0.45']

    # s = 'anarchism is a political philosophy that advocated stateless societies often self governed voluntary institutions but that several authors have defined'.split()
    # sv = []
    # for word in s:
    #     sv.append(w.get_vector(word))
    # for k in xrange(len(sv)):
    #     make_plot_from_vector(sv[k], s[k], 'vector_'+str(k)+'.png')
import gensim
import matplotlib
import numpy as np
from w2v import w2v
import metrics
import similarity_plots as sp


f = open('../data/google/model/docfreq.npy', 'rb')
docfreqs = np.load(f)
f.close()

# f = open('../data/tweets/model/idf_weights.npy')
# idf_weights = np.load(f)
# f.close()

w = w2v()
#w.load_minimal('../data/google/model/minimal')

labels = ['No pairs', 'Pairs']
colors = ['0.75', '0.45']


# tables = ['../data/google/pairs/sets/tfidf_no_pairs_r-validation.npy', '../data/google/pairs/sets/tfidf_pairs_r-validation.npy']
# min, max = sp.calculate_min_max_from_table(tables)
# split = sp.calculate_split_from_table(tables, verbose=False, normalize=(min, max))
# tables = ['../data/google/pairs/sets/tfidf_no_pairs_r-test.npy', '../data/google/pairs/sets/tfidf_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tables, split, normalize=(min, max))
# sp.calculate_JS_from_table(tables, normalize=(min, max), verbose=True)
#
# tables = ['../data/google/pairs/sets/mean_no_pairs_r-validation.npy', '../data/google/pairs/sets/mean_pairs_r-validation.npy']
# min, max = sp.calculate_min_max_from_table(tables)
Example #11
File: run.py  Project: afcarl/simpleML
'''
Coding Just for Fun
Created by burness on 16/3/6.
'''
from data_utils import *
from w2v import w2v
random.seed(314)
data = StanfordSentiment()
tokens = data.tokens()
nWords = len(tokens)
dimVectors = 10

C = 5
word2vec_model = w2v(data, C=C)

random.seed(2016)
np.random.seed(2016)
wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - .5) / dimVectors,
                              np.zeros((nWords, dimVectors))), axis=0)
wordVectors0 = word2vec_model.sgd(
    lambda vec: word2vec_model.word2vec_sgd_wrapper(
        word2vec_model.cbow, tokens, vec, data, C,
        word2vec_model.softmax_cost_grad),
    wordVectors, 0.3, 2000, None, True, PRINT_EVERY=10)

wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:])

print "\n=== For autograder ==="
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print checkVecs
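A small shape check, as a sketch assuming sgd returns an array of the same shape it is given: wordVectors0 stacks the input vectors on top of the output vectors, and the final sum leaves one dimVectors-dimensional vector per vocabulary word.

assert wordVectors0.shape == (2 * nWords, dimVectors)  # input vectors stacked on output vectors
assert wordVectors.shape == (nWords, dimVectors)       # one combined vector per token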
def extract_random(max_words_per_text=30,
                   skip=2,
                   n_pairs=5000000,
                   corpus='../data/enwiki.txt',
                   w2v_model='../data/model/minimal',
                   pairs_file='../data/pairs/enwiki_pairs_r.txt',
                   no_pairs_file='../data/pairs/enwiki_no_pairs_r.txt'):

    w = w2v()
    print 'Loading model...'
    w.load_minimal(w2v_model)
    print 'Done.'
    o = open(pairs_file, 'w')
    n = open(no_pairs_file, 'w')

    pool = []
    add1 = 0
    add2 = 10

    current_pair = 0.0

    f = open(corpus, 'r')
    for line in f:
        words = line.split()  #get words of current paragraph

        words_per_text_1 = randint(10, max_words_per_text)
        words_per_text_2 = randint(10, max_words_per_text)

        if len(words) >= words_per_text_1 + words_per_text_2 + (
                words_per_text_1 + words_per_text_2
        ) / 4:  #no of words needs to be sufficiently high
            pair1 = []
            pair2 = []
            while len(pair1) < words_per_text_1 and len(
                    words) > 0:  #add words to first part of pair
                current = words.pop(0)
                if w.exists_word(current):
                    pair1.append(current)
            for s in xrange(skip):
                if len(words) > 0:
                    words.pop(0)
            while len(pair2) < words_per_text_2 and len(
                    words) > 0:  #add words to second part of pair
                current = words.pop(0)
                if w.exists_word(current):
                    pair2.append(current)
            if len(pair1) == words_per_text_1 and len(
                    pair2) == words_per_text_2:
                if add1 == 0:  #add pair1 to the pool
                    pool.append(pair1)
                if add2 == 0:  #add pair2 to the pool
                    pool.append(pair2)
                add1 = (add1 + 1) % 20
                add2 = (add2 + 1) % 20
                o.write(' '.join(pair1) + ';' + ' '.join(pair2) +
                        '\n')  #write pairs to output file
                current_pair += 1.0

        if len(pool) >= 100:  #process 'no pairs'
            print 'Progress %.3f%%' % (100.0 * current_pair / n_pairs)
            for i in xrange(10):
                shuffle(pool)
                for p in xrange(len(pool) - 1):
                    n.write(' '.join(pool[p]) + ';' + ' '.join(pool[p + 1]) +
                            '\n')  #write pairs to output file
            pool = []
            o.flush()
            n.flush()
            if current_pair >= n_pairs:
                break

    f.close()
    o.close()