Пример #1
0
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)
    
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    start = time.time()
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    print("Time elapsed for SVD: %f" % (time.time() - start))

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
    def _counts2PMI(self):

        words = list(self.words.keys())
        contexts = list(self.contexts.keys())
        iw = sorted(words)
        ic = sorted(contexts)
        wi = dict([(w, i) for i, w in enumerate(iw)])
        ci = dict([(c, i) for i, c in enumerate(ic)])

        counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
        tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
        update_threshold = 100000
        i = 0
        with open(self.count_pair_file) as f:
            for line in f:
                count, word, context = line.strip().split()
                if word in wi and context in ci:
                    tmp_counts[wi[word], ci[context]] = int(count)
                i += 1
                if i == update_threshold:
                    counts = counts + tmp_counts.tocsr()
                    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                    i = 0
        counts = counts + tmp_counts.tocsr()
        pmi = self.calc_pmi(counts, self.cds)

        save_matrix(self.pmi_file, pmi)
        save_vocabulary(self.pmi_file + '.words.vocab', iw)
        save_vocabulary(self.pmi_file + '.contexts.vocab', ic)
        self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
        cf.saveDictionary(self.explicit,self.dict_name.split('/')[0]+'/'+self.dict_name.split('/')[1]+'_explicit_ppmi.bin')
Пример #3
0
def create_representation(args):
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])
    
    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        else:
            return PositiveExplicit(path, True, neg)
        
    elif rep_type == 'SVD':
        if w_c:
            return EnsembleEmbedding(SVDEmbedding(path, False, eig, False), SVDEmbedding(path, False, eig, True), True)
        else:
            return SVDEmbedding(path, True, eig)
    elif rep_type == 'GLOVE':
        return GLOVEEmbedding(path, True)        
    else:
        if w_c:
            return EnsembleEmbedding(Embedding(path + '.words', False), Embedding(path + '.contexts', False), True)
        else:
            return Embedding(path + '.words', True)
Пример #4
0
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM [default: 1]
    """)

    repres = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    if (repres == "BPMI"):
        explicit = BinExplicit(pmi_path, normalize=False)
    elif (repres == "PMI"):
        explicit = NoExplicit(pmi_path, normalize=False, k=k)
    elif (repres == "NPMI"):
        explicit = NegExplicit(pmi_path, normalize=False)
    else:
        explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
Пример #5
0
def main():
    args = docopt("""
    Usage:
        ppmi2svd.py [options] <ppmi> <output>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 300]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)
    
    ppmi_path = args['<ppmi>']
    output_path = args['<output>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    
    explicit = PositiveExplicit(ppmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
Пример #6
0
def folder2chi(folder):
    return PositiveExplicit(join(folder, "chi")).similarity_first_order