Example #1
def read_counts_matrix(counts_path):
    """
    Reads the counts into a sparse matrix (CSR) from the count-word-context textual format.
    """
    words = load_count_vocabulary(counts_path + '.words.vocab')
    contexts = load_count_vocabulary(counts_path + '.contexts.vocab')
    words = list(words.keys())
    contexts = list(contexts.keys())
    iw = sorted(words)
    ic = sorted(contexts)
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])
    
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000
    i = 0
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            if word in wi and context in ci:
                tmp_counts[wi[word], ci[context]] = int(count)
            i += 1
            if i == update_threshold:
                counts = counts + tmp_counts.tocsr()
                tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                i = 0
    counts = counts + tmp_counts.tocsr()
    
    return counts, iw, ic
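Every example on this page calls load_count_vocabulary, whose definition is not shown here. Below is a minimal sketch of such a helper together with the imports the snippet above relies on; the "token count per line" file format and the int conversion are assumptions for illustration, not the repository's actual implementation.

# Imports required by read_counts_matrix above.
import numpy as np
from scipy.sparse import csr_matrix, dok_matrix

def load_count_vocabulary(path):
    # Hypothetical sketch, not the real helper: assumes each line of the vocab
    # file holds "<token> <count>" and returns a dict mapping token -> int count.
    vocab = {}
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            token, count = line.strip().split()
            vocab[token] = int(count)
    return vocab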
Example #2
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --processes_num NUM        The number of processes [default: 12]
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 300]
        --iters NUM                The number of iterations [default: 1]
    """)

    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    processes_num = int(args['--processes_num'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    global_word_count = Value('l', 0)
    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    print()

    for i in range(iters):
        pool = Pool(processes=processes_num,
                    initializer=__init_process,
                    initargs=(w2i, c2i, syn0, syn1, table, negative, size,
                              alpha, processes_num, global_word_count,
                              pairs_num, iters, pairs_path))
        pool.map(train_process, range(processes_num))

    save(i2w, syn0, outputs_path)
    print("word2vecf finished")
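The example above also calls load_vocabulary, which returns both directions of the token/index mapping (w2i, i2w and c2i, i2c). A rough sketch follows, under the assumption that the words and contexts files list one token per line, possibly followed by a count; the repository's actual helper may read a different format.

def load_vocabulary(path):
    # Hypothetical sketch: build token -> index and index -> token maps in a way
    # that is consistent with how w2i/i2w and c2i/i2c are used above.
    with open(path) as f:
        tokens = [line.strip().split()[0] for line in f if line.strip()]
    t2i = {t: i for i, t in enumerate(tokens)}
    i2t = tokens
    return t2i, i2t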
Example #3
def c2p(args, tid):
    pairs_file = open(args['<pairs>']+"_"+str(tid), 'w')
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0 #number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values()) #number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in six.iteritems(vocab) if count > subsample]) #subsampling technique
    if tid == 0:
        print ('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0 and tid == 0:
                sys.stdout.write("\r" + str(int(line_num/1000)) + "K lines processed.")
                sys.stdout.flush()
            if line_num % threads_num != tid:
                continue
            ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
            # word_word(line, args, vocab, pairs_file, sub, subsampler)
            # word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
            # word_wordPos(line, args, vocab, pairs_file, sub, subsampler)

    pairs_file.close()
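After subsample *= train_uni_num, the dictionary built above stores a per-token discard probability of 1 - sqrt(subsample / count), so more frequent tokens are dropped more aggressively. A quick numeric check with made-up figures (not taken from any particular corpus): with --sub 1e-5 and 100 million unigram tokens the threshold becomes 1000, and a token that occurs 100,000 times is discarded about 90% of the time.

from math import sqrt

sub_option = 1e-5              # value passed via --sub (illustrative)
train_uni_num = 100000000      # unigram tokens in the corpus (illustrative)
count = 100000                 # corpus frequency of some token (illustrative)

subsample = sub_option * train_uni_num       # 1000.0
discard_prob = 1 - sqrt(subsample / count)   # 1 - sqrt(0.01) = 0.9
print(discard_prob)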
Example #4
def c2p(args, tid):
    pairs_file = open(args['<pairs>']+"_"+str(tid), 'w')
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0 #number of (unigram) tokens in corpus
    for w, c in vocab.iteritems():
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values()) #number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in vocab.iteritems() if count > subsample]) #subsampling technique
    if tid == 0:
        print 'vocabulary size: ' + str(len(vocab))
    with open(args['<corpus>']) as f:
        line_num = 0
        if tid == 0:
            print str(line_num/1000**1) + "K lines processed."
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0 and tid == 0:
                print "\x1b[1A" + str(line_num/1000) + "K lines processed."
            if line_num % threads_num != tid:
                continue
            line2features(line, args, vocab, pairs_file, sub, subsampler)

    pairs_file.close()
Example #5
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --processes_num NUM        The number of processes [default: 12]
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 300]
        --iters NUM                The number of iterations [default: 1]
    """)
    
    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    processes_num = int(args['--processes_num'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    global_word_count = Value('l', 0)
    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    print ()

    for i in range(iters):
        pool = Pool(processes=processes_num, initializer=__init_process, initargs=(w2i, c2i, syn0, syn1, table, negative, size, alpha, processes_num, global_word_count, pairs_num, iters, pairs_path))
        pool.map(train_process, range(processes_num))

    save(i2w, syn0, outputs_path)
    print ("word2vecf finished")
Example #6
def read_counts_matrxi_fast(counts_path, counts_path_new):
    """
    Reads the counts into a sparse matrix (CSR) from the count-word-context textual format.
    """
    df = pd.read_csv(counts_path_new, sep=" ", names=["num", "word", "context"],
                     converters={"num": np.float32, "word": str, "context": str}, header=None)

    words = load_count_vocabulary(counts_path + '.words.vocab')  #dict mapping each word to its occurrence count
    contexts = load_count_vocabulary(counts_path + '.contexts.vocab')

    words = list(words.keys())  #list of all the words
    contexts = list(contexts.keys())  #list of all the context words

    iw = sorted(words)  #sorted word list
    ic = sorted(contexts)  #sorted context word list

    wi = pd.Series(index=iw, data=sp.arange(len(iw)))  #acts as a dictionary: word -> index
    ci = pd.Series(index=ic, data=sp.arange(len(ic)))  #acts as a dictionary: context word -> index

    return csr_matrix((df.num, (wi[df.word], ci[df.context])), shape=(len(iw), len(ic)), dtype=np.float32), list(iw), list(ic)
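This variant builds the whole CSR matrix in one shot from a pandas DataFrame instead of looping over lines. A hedged usage sketch with the imports the snippet relies on follows; the paths are placeholders, and sp.arange needs an older SciPy (np.arange works on current versions).

import numpy as np
import pandas as pd
import scipy as sp                      # only needed for sp.arange; np.arange also works
from scipy.sparse import csr_matrix

# Placeholder paths for illustration; adjust to wherever the counts files live.
counts_path = 'counts'                  # expects counts.words.vocab and counts.contexts.vocab
counts_path_new = 'counts.txt'          # "count word context" triples, one per line

matrix, iw, ic = read_counts_matrxi_fast(counts_path, counts_path_new)
print(matrix.shape, len(iw), len(ic))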
Example #7
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 100]
        --iters NUM                The number of iterations [default: 1]
    """)

    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    for i in range(iters):
        train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha,
                      negative, pairs_num, iters)
    save(i2w, syn0, outputs_path)
    print("word2vecf finished")
Example #8
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 100]
        --iters NUM                The number of iterations [default: 1]
    """)
    
    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    for i in range(iters):
        train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha, negative, pairs_num, iters)
    save(i2w, syn0, outputs_path)
    print ("word2vecf finished")
Example #9
def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    feature = args['--feature']  #features, also known as co-occurrence types, are critical to the properties of word representations. Supports ngram-ngram, word-word, word-character, and so on.
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0

    vocab = load_count_vocabulary(
        args['<vocab>'])  #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  #number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  #number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in six.iteritems(vocab)
                           if count > subsample])  #subsampling technique
    if tid == 0:
        print('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0 and tid == 0:
                sys.stdout.write("\r" + str(int(line_num / 1000)) +
                                 "K lines processed.")
                sys.stdout.flush()
            if line_num % threads_num != tid:
                continue
            if feature == 'ngram-ngram':
                ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
            elif feature == 'word-word':  #identical to word2vec
                word_word(line, args, vocab, pairs_file, sub, subsampler)
            elif feature == 'word-character':  # similar to fastText
                word_character(line, args, vocab, pairs_file, sub, subsampler)
            else:
                break
            # word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
            # word_wordPos(line, args, vocab, pairs_file, sub, subsampler)

    pairs_file.close()
Example #10
def main():
    args = docopt("""
    Usage:
        corpus2pairs.py [options] <corpus> <vocab> <pairs>

    Options:
        --win NUM                  Window size [default: 2]
        --sub NUM                  Subsampling threshold [default: 0]
        --ngram_word NUM           (Center) word vocabulary includes grams of 1st to nth order [default: 1]
        --ngram_context NUM        Context vocabulary includes grams of 1st to nth order [default: 1]
        --overlap                  Whether overlapping pairs are allowed or not
    """)

    print("**********************")
    print("corpus2pairs")

    pairs_file = open(args['<pairs>'], 'w')
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(
        args['<vocab>'])  #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  #number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  #number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in six.iteritems(vocab)
                           if count > subsample])  #subsampling technique
    print('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        print(str(int(line_num / 1000**1)) + "K lines processed.")
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0:
                print("\x1b[1A" + str(int(line_num / 1000)) +
                      "K lines processed.")
            line2features(line, args, vocab, pairs_file, sub, subsampler)

    pairs_file.close()
    print("corpus2pairs finished")
Example #11
def main():
    args = docopt("""
    Usage:
        corpus2pairs.py [options] <corpus> <vocab> <pairs>

    Options:
        --win NUM                  Window size [default: 2]
        --sub NUM                  Subsampling threshold [default: 0]
        --ngram_word NUM           (Center) word vocabulary includes grams of 1st to nth order [default: 1]
        --ngram_context NUM        Context vocabulary includes grams of 1st to nth order [default: 1]
        --overlap                  Whether overlapping pairs are allowed or not
    """)

    print ("**********************")
    print ("corpus2pairs")

    pairs_file = open(args['<pairs>'], 'w')
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0 #number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values()) #number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in six.iteritems(vocab) if count > subsample]) #subsampling technique
    print ('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0:
                sys.stdout.write("\r" + str(int(line_num/1000)) + "K lines processed.")
            line2features(line, args, vocab, pairs_file, sub, subsampler)

    pairs_file.close()
    print ("corpus2pairs finished")
Example #12
def c2p(args, tid):
    pairs_file = open(args['<pairs>']+"_"+str(tid), 'w')
    feature = args['--feature'] #features, also known as co-occurrence types, are critical to the properties of word representations. Supports ngram-ngram, word-word, word-character, and so on.
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0

    vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0 #number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values()) #number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in six.iteritems(vocab) if count > subsample]) #subsampling technique
    if tid == 0:
        print ('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0 and tid == 0:
                sys.stdout.write("\r" + str(int(line_num/1000)) + "K lines processed.")
                sys.stdout.flush()
            if line_num % threads_num != tid:
                continue
            if feature == 'ngram-ngram':
                ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
            elif feature == 'word-word': #identical to word2vec
                word_word(line, args, vocab, pairs_file, sub, subsampler)
            elif feature == 'word-character': # similar to fastText
                word_character(line, args, vocab, pairs_file, sub, subsampler)
            else:
                break
            # word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
            # word_wordPos(line, args, vocab, pairs_file, sub, subsampler)

    pairs_file.close()
Example #13
def load_counts(self, path):
    count_path = path[:path.rfind('/') + 1] + 'counts.words.vocab'
    ng_freqs = load_count_vocabulary(count_path)
    sz = sum(int(v) for v in ng_freqs.values())
    return sz, ng_freqs
Example #14
def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    win = int(args['--win'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    ngram_word = int(args['--ngram_word'])
    ngram_context = int(args['--ngram_context'])
    overlap = args['--overlap']
    threads_num = int(args['--threads_num'])

    vocab = load_count_vocabulary(
        args['<vocab>'])  #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  #number of (unigram) tokens in corpus
    for w, c in vocab.iteritems():
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  #number of (ngram) tokens in corpus
    if tid == 0:
        print 'vocabulary size: ' + str(len(vocab))
        print 'number of training words (uni-grams): ' + str(train_uni_num)
        print 'number of training n-grams: ' + str(train_num)
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in vocab.iteritems()
                           if count > subsample])  #subsampling technique

    rnd = Random(17)
    with open(args['<corpus>']) as f:
        line_num = 0
        if tid == 0:
            print str(line_num / 1000**1) + "K lines processed."
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0 and tid == 0:
                print "\x1b[1A" + str(line_num / 1000) + "K lines processed."
            if line_num % threads_num != tid:
                continue
            tokens = line.strip().split()
            for i in xrange(len(tokens)):  #loop for each position in a line
                for gram_word in xrange(1, ngram_word + 1):  #loop for grams of different orders in (center) word
                    word = getNgram(tokens, i, gram_word)
                    word = check_word(word, vocab, sub, subsampler, rnd)
                    if word is None:
                        continue
                    for gram_context in xrange(1, ngram_context + 1):  #loop for grams of different orders in context
                        start = i - win + gram_word - 1
                        end = i + win - gram_context + 1
                        for j in xrange(start, end + 1):
                            if overlap:
                                if i == j and gram_word == gram_context:
                                    continue
                            else:
                                if len(set(range(i, i + gram_word)) & set(range(j, j + gram_context))) > 0:
                                    continue
                            context = getNgram(tokens, j, gram_context)
                            context = check_word(context, vocab, sub, subsampler, rnd)
                            if context is None:
                                continue
                            pairs_file.write(word + ' ' + context + "\n")  #write pairs to the file
    pairs_file.close()
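The pair-generation loop above depends on two helpers that are not shown here: getNgram, which extracts the n-gram of a given order starting at a position, and check_word, which filters out-of-vocabulary and subsampled tokens. Plausible sketches follow; the '@$' joiner is only an assumption inferred from the "'@$' not in w" unigram test earlier, and the real implementations may differ.

def getNgram(tokens, pos, gram):
    # Hypothetical sketch: return the n-gram of order `gram` starting at `pos`,
    # or None when it would run past either end of the line.
    if pos < 0 or pos + gram > len(tokens):
        return None
    if gram == 1:
        return tokens[pos]
    return '@$'.join(tokens[pos:pos + gram])  # separator is an assumption

def check_word(word, vocab, sub, subsampler, rnd):
    # Hypothetical sketch: keep a token only if it is in the vocabulary and, when
    # subsampling is on, if it survives a draw against its discard probability.
    if word is None or word not in vocab:
        return None
    if sub and word in subsampler and rnd.random() < subsampler[word]:
        return None
    return word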