def _counts2PMI(self):
        # Build sorted word/context vocabularies with index lookups, accumulate the
        # pair counts into a sparse matrix, and save the resulting PPMI model.
        words = list(self.words.keys())
        contexts = list(self.contexts.keys())
        iw = sorted(words)
        ic = sorted(contexts)
        wi = dict([(w, i) for i, w in enumerate(iw)])
        ci = dict([(c, i) for i, c in enumerate(ic)])

        # Accumulate pair counts in a DOK buffer and periodically fold it into the
        # CSR matrix, keeping memory bounded while streaming a large pair file.
        counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
        tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
        update_threshold = 100000
        i = 0
        with open(self.count_pair_file) as f:
            for line in f:
                count, word, context = line.strip().split()
                if word in wi and context in ci:
                    tmp_counts[wi[word], ci[context]] = int(count)
                i += 1
                if i == update_threshold:
                    counts = counts + tmp_counts.tocsr()
                    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                    i = 0
        counts = counts + tmp_counts.tocsr()
        pmi = self.calc_pmi(counts, self.cds)

        save_matrix(self.pmi_file, pmi)
        save_vocabulary(self.pmi_file + '.words.vocab', iw)
        save_vocabulary(self.pmi_file + '.contexts.vocab', ic)
        self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
        cf.saveDictionary(self.explicit, self.dict_name.split('/')[0] + '/' + self.dict_name.split('/')[1] + '_explicit_ppmi.bin')
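The calc_pmi helper called above (and in the main() functions below) is not included in these snippets. The following is a minimal sketch of a compatible implementation, assuming it applies context-distribution smoothing by raising the context counts to the power cds and returns the PMI ratio before the log, which is how Example #3 below treats its result (it compares pmi.data > 1 and only later applies np.log):

import numpy as np
from scipy.sparse import csr_matrix

def calc_pmi(counts, cds=1.0):
    # Sketch: PMI ratio #(w,c) * N_cds / (#(w) * #(c)**cds), where N_cds is the
    # sum of the smoothed context counts; taking the log is left to the caller.
    counts = csr_matrix(counts, dtype=np.float64)
    sum_w = np.asarray(counts.sum(axis=1)).ravel()          # word marginals #(w)
    sum_c = np.asarray(counts.sum(axis=0)).ravel() ** cds   # smoothed context marginals
    sum_total = sum_c.sum()

    coo = counts.tocoo()
    data = coo.data * sum_total / (sum_w[coo.row] * sum_c[coo.col])
    return csr_matrix((data, (coo.row, coo.col)), shape=counts.shape)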
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path> 
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    #words = load_count_vocabulary(counts_path + '.words.vocab')
    #contexts = load_count_vocabulary(counts_path + '.contexts.vocab')
    #loader = np.load(counts_path+'.pairs.counts.npz')
    #counts = csr_matrix((loader['data'], loader['indices'], loader['indptr']))
    #counts, iw, ic = read_counts_matrix(counts_path)
    #iw = sorted(words)
    #ic = sorted(contexts)

    counts, iw, ic = read_counts_matrix(counts_path)
    pmi = calc_pmi(counts, cds, alpha=1.0)

    save_matrix(vectors_path + '.count_matrix', counts)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
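Based on the docopt usage string, a run of this script would look something like the following (the file names are placeholders, not from the original project), writing the PMI matrix to wiki.ppmi along with wiki.ppmi.words.vocab and wiki.ppmi.contexts.vocab:

python counts2pmi.py --cds 0.75 wiki.counts wiki.ppmi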
Example #3
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)
    
    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])
    
    
    # Rewrite the counts file with normalized line endings before loading it.
    counts_path_new = counts_path + '-new'
    with open(counts_path) as fin, open(counts_path_new, "w") as fout:
        for line in fin:
            fout.write(line.strip() + "\n")

    counts, iw, ic = read_counts_matrxi_fast(counts_path, counts_path_new)

    pmi = calc_pmi(counts, cds)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
    
    # Save the raw co-occurrence counts (nonzero entries only).
    savePmiNonzeroTerm_fast(counts, vectors_path + '.cooccurrence')

    # calc_pmi returns the PMI ratio before the log, so ratio > 1 marks entries
    # whose PMI will be positive once the log is taken.
    remain_index = pmi.data > 1

    pmi.data = np.log(pmi.data)
    savePmiNonzeroTerm_fast(pmi, vectors_path + '.PMI')

    # Keep only co-occurrence counts with positive PMI; this assumes counts and
    # pmi share the same sparsity pattern (and hence the same ordering of .data).
    counts.data = counts.data * remain_index
    counts.eliminate_zeros()
    savePmiNonzeroTerm_fast(counts, vectors_path + '.PPMIcooccurrence')

    # Clip negative PMI values to zero to obtain PPMI.
    pmi.data[pmi.data < 0] = 0
    pmi.eliminate_zeros()
    savePmiNonzeroTerm_fast(pmi, vectors_path + '.PPMI')
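savePmiNonzeroTerm_fast is another helper that is not shown in these snippets. Judging only from its name and call sites, it presumably dumps the nonzero entries of a sparse matrix to a text file; a purely hypothetical stand-in (the actual output format of the original helper is unknown) might be:

from scipy.sparse import coo_matrix

def savePmiNonzeroTerm_fast(matrix, path):
    # Hypothetical format: one "row_index col_index value" triple per line.
    coo = coo_matrix(matrix)
    with open(path, 'w') as out:
        for r, c, v in zip(coo.row, coo.col, coo.data):
            out.write("%d %d %f\n" % (r, c, v))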
Example #4
def main():
    args = docopt("""
    Usage:
        counts2ica.py [options] <counts> <output_path>
    
    Options:
        --cps NUM    Number of ICA components to obtain [default: 50]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']

    counts, iw, ic = read_counts_matrix(counts_path)

    # docopt returns option values as strings, so convert the component count.
    embeddings = calc_ica(counts, int(args['--cps']))

    save_matrix(vectors_path, embeddings)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
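calc_ica is likewise not defined in these snippets. Assuming it wraps scikit-learn's FastICA (an assumption; the original could rely on any ICA implementation), a minimal sketch would be:

import numpy as np
from sklearn.decomposition import FastICA

def calc_ica(counts, n_components=50):
    # Sketch: densify the sparse count matrix (memory-heavy for large
    # vocabularies) and return per-word independent-component loadings.
    dense = np.asarray(counts.todense(), dtype=np.float64)
    ica = FastICA(n_components=int(n_components), random_state=0)
    return ica.fit_transform(dense)  # shape: (n_words, n_components)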
Example #5
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    counts, iw, ic = read_counts_matrix(counts_path)

    pmi = calc_pmi(counts, cds)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
Example #6
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)
    
    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])
    
    counts, iw, ic = read_counts_matrix(counts_path)

    pmi = calc_pmi(counts, cds)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
Example #7
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <words_vocab> <contexts_vocab> <counts> <output>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    print "**********************"
    print "counts2ppmi"
    counts_path = args['<counts>']
    vectors_path = args['<output>']
    words_path = args['<words_vocab>']
    contexts_path = args['<contexts_vocab>']
    cds = float(args['--cds'])

    counts = read_counts_matrix(words_path, contexts_path, counts_path)
    pmi = calc_pmi(counts, cds)
    save_matrix(vectors_path, pmi)
Example #8
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <words_vocab> <contexts_vocab> <counts> <output>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)
    
    print "**********************"
    print "counts2ppmi"
    counts_path = args['<counts>']
    vectors_path = args['<output>']
    words_path = args['<words_vocab>']
    contexts_path = args['<contexts_vocab>']
    cds = float(args['--cds'])

    counts = read_counts_matrix(words_path, contexts_path, counts_path)
    pmi = calc_pmi(counts, cds)
    save_matrix(vectors_path, pmi)