Example #1
0
def main():
    """CLI entry point: factorize a PPMI matrix with truncated SVD and save the factors.

    Loads the sparse PPMI matrix, runs sparsesvd to the requested rank, and
    writes U^T, the singular values, V^T, and both vocabularies next to
    <output_path>.
    """
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])

    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    # Time only the factorization itself.
    t0 = time.time()
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    print("Time elapsed for SVD: %f" % (time.time() - t0))

    # Persist the three SVD factors, then the row/column vocabularies.
    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
Example #2
0
def get_sim_pair_ppmi(corpus, target_word1, target_word2, year, results_dir):
    """Compute the PPMI cosine between two words for one year and append it to a tsv.

    Results already present in the output file are not written again; the
    cosine is always printed.
    """
    results_pair = target_word1 + '-' + target_word2 + '-cosines.tsv'
    out_path = results_dir + results_pair

    embedd = PositiveExplicit.load(corpus + "/" + str(year))
    cos = embedd.similarity(target_word1, target_word2)

    # Load any previously written lines so we can skip duplicates.
    existing_results = []
    if os.path.isfile(out_path):
        print('file exists')
        with open(out_path) as infile:
            existing_results = infile.read().split('\n')

    with open(out_path, 'a') as outfile:
        result = target_word1 + '-' + target_word2 + '\t' + str(
            year) + '\t' + str(cos) + '\n'
        if result.strip() in existing_results:
            print('result already there')
        else:
            outfile.write(result)

    print(cos)
    def _counts2PMI(self):
        """Build the word-context count matrix from the pair file, convert to PMI, and persist.

        Counts are accumulated in a DOK matrix and folded into a CSR matrix
        every `update_threshold` lines to keep the DOK small; the PMI matrix,
        both vocabularies, and the resulting PositiveExplicit model are then
        saved to disk.
        """
        iw = sorted(list(self.words.keys()))
        ic = sorted(list(self.contexts.keys()))
        wi = {w: i for i, w in enumerate(iw)}
        ci = {c: i for i, c in enumerate(ic)}

        shape = (len(wi), len(ci))
        counts = csr_matrix(shape, dtype=np.float32)
        tmp_counts = dok_matrix(shape, dtype=np.float32)
        update_threshold = 100000
        pending = 0
        with open(self.count_pair_file) as f:
            for line in f:
                count, word, context = line.strip().split()
                if word in wi and context in ci:
                    # NOTE(review): assignment (not +=) — assumes each
                    # (word, context) pair appears at most once per chunk;
                    # confirm the count file has unique pairs.
                    tmp_counts[wi[word], ci[context]] = int(count)
                pending += 1
                if pending == update_threshold:
                    # Fold the buffered counts into the CSR accumulator.
                    counts = counts + tmp_counts.tocsr()
                    tmp_counts = dok_matrix(shape, dtype=np.float32)
                    pending = 0
        # Fold whatever remains after the last full chunk.
        counts = counts + tmp_counts.tocsr()
        pmi = self.calc_pmi(counts, self.cds)

        save_matrix(self.pmi_file, pmi)
        save_vocabulary(self.pmi_file + '.words.vocab', iw)
        save_vocabulary(self.pmi_file + '.contexts.vocab', ic)
        self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
        cf.saveDictionary(self.explicit,
                          self.dict_name.split('/')[0] + '/' + self.dict_name.split('/')[1] + '_explicit_ppmi.bin')
Example #4
0
def create_representation(args):
    """Instantiate an embedding model from parsed docopt arguments.

    Dispatches on <representation> ('PPMI', 'SVD', 'GLOVE', or anything else
    for plain word2vec-style text embeddings); --w+c selects an ensemble of
    word and context vectors where supported.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, True, neg)

    if rep_type == 'SVD':
        if w_c:
            return EnsembleEmbedding(SVDEmbedding(path, False, eig, False),
                                     SVDEmbedding(path, False, eig, True), True)
        return SVDEmbedding(path, True, eig)

    if rep_type == 'GLOVE':
        return GLOVEEmbedding(path, True)

    # Default: plain embeddings stored as '<path>.words' / '<path>.contexts'.
    if w_c:
        return EnsembleEmbedding(Embedding(path + '.words', False),
                                 Embedding(path + '.contexts', False), True)
    return Embedding(path + '.words', True)
Example #5
0
def main():
    """CLI entry point: build the chosen PMI variant, factorize it, and save the factors.

    <repres> selects the matrix transform (BPMI / PMI / NPMI, anything else
    falls back to positive PMI with `neg` shifted negative samples).
    """
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM [default: 1]
    """)

    repres = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    # Pick the matrix representation; PositiveExplicit is the fallback.
    builders = {
        "BPMI": lambda: BinExplicit(pmi_path, normalize=False),
        "PMI": lambda: NoExplicit(pmi_path, normalize=False, k=k),
        "NPMI": lambda: NegExplicit(pmi_path, normalize=False),
    }
    explicit = builders.get(
        repres, lambda: PositiveExplicit(pmi_path, normalize=False, neg=neg))()

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    # Persist the three SVD factors, then the row/column vocabularies.
    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
Example #6
0
def main():
    """CLI entry point: truncated SVD of a PPMI matrix; saves only the factors (no vocab)."""
    args = docopt("""
    Usage:
        ppmi2svd.py [options] <ppmi> <output>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 300]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    ppmi_path, output_path = args['<ppmi>'], args['<output>']
    dim, neg = int(args['--dim']), int(args['--neg'])

    explicit = PositiveExplicit(ppmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
Example #7
0
def folder2chi(folder):
    """Load the 'chi' PPMI model under *folder* and return its first-order similarity function."""
    model = PositiveExplicit(join(folder, "chi"))
    return model.similarity_first_order
Example #8
0
def get_sim_neighbors_ppmi(corpus, target_word1, target_word2, year1, year2, n, results_dir):
    """Compare words via second-order vectors over their nearest-neighbor union.

    Two supported modes:
      * diachronic — two different years and one target word
        (target_word1 == target_word2);
      * synchronic — the same year and two different target words.

    Writes the paired neighbor lists to <results_dir>neighbors/ and appends
    the cosine to a per-comparison tsv, skipping results already on disk.

    Raises:
        ValueError: for any other year/word combination (the original code
            fell through both branches and crashed with NameError).
    """
    diachronic = (year1 != year2) and (target_word1 == target_word2)
    synchronic = (year1 == year2) and (target_word1 != target_word2)
    if not (diachronic or synchronic):
        # BUG FIX: previously this combination left results_cosine/cos/
        # neighbor_words1/2 unbound and crashed later with NameError.
        raise ValueError('expected either two different years with one target word, '
                         'or one year with two different target words')

    if not os.path.isdir(results_dir + 'neighbors'):
        os.mkdir(results_dir + 'neighbors')

    results_words = 'neighbors/' + target_word1 + '-' + target_word2 + '-' + str(year1) + '-' + str(year2) + '.tsv'

    if diachronic:
        results_cosine = 'cosines-' + target_word1 + '-n-' + str(n) + '.tsv'

        embedd_year1 = PositiveExplicit.load(corpus + "/" + str(year1))
        embedd_year2 = PositiveExplicit.load(corpus + "/" + str(year2))

        # NOTE(review): these vocab indices are loaded but not used below —
        # kept so a missing index file still fails loudly, as before.
        with open(corpus + '/' + str(year1) + '-index.pkl', 'rb') as infile:
            year1_vocab = pickle.load(infile, encoding='utf-8')
        with open(corpus + '/' + str(year2) + '-index.pkl', 'rb') as infile:
            year2_vocab = pickle.load(infile, encoding='utf-8')

        if (embedd_year1.represent(target_word1).nnz != 0) and (embedd_year2.represent(target_word1).nnz != 0):
            neighbors_year1 = get_nearest_neighbors(embedd_year1, target_word1, n)
            neighbors_year2 = get_nearest_neighbors(embedd_year2, target_word1, n)

            union = get_union(neighbors_year1, neighbors_year2)
            # Keep only union words representable in both spaces.
            filtered_union = filter_union(union, embedd_year1, embedd_year2, target_word1)

            vec1 = get_second_order_vector(embedd_year1, filtered_union, target_word1)
            vec2 = get_second_order_vector(embedd_year2, filtered_union, target_word1)

            neighbor_words1 = get_nearest_neighbor_words(neighbors_year1)
            neighbor_words2 = get_nearest_neighbor_words(neighbors_year2)

            cos = get_cosine(vec1, vec2)
        else:
            print('word out of vocab')
            cos = 'OOV'
            neighbor_words1 = ['OOV']
            neighbor_words2 = ['OOV']

    else:  # synchronic: one year, two words
        results_cosine = 'cosines-' + target_word1 + '-' + target_word2 + '-n-' + str(n) + '.tsv'

        embedd_year = PositiveExplicit.load(corpus + "/" + str(year1))

        if (embedd_year.represent(target_word1).nnz) != 0 and (embedd_year.represent(target_word2).nnz != 0):
            neighbors_word1 = get_nearest_neighbors(embedd_year, target_word1, n)
            neighbors_word2 = get_nearest_neighbors(embedd_year, target_word2, n)

            union = get_union(neighbors_word1, neighbors_word2)

            vec1 = get_second_order_vector(embedd_year, union, target_word1)
            vec2 = get_second_order_vector(embedd_year, union, target_word2)

            neighbor_words1 = get_nearest_neighbor_words(neighbors_word1)
            neighbor_words2 = get_nearest_neighbor_words(neighbors_word2)

            cos = get_cosine(vec1, vec2)
        else:
            print('word out of vocab')
            cos = 'OOV'
            neighbor_words1 = ['OOV']
            neighbor_words2 = ['OOV']

    # BUG FIX: use ONE path for both the dedup check and the append. The
    # original checked results_dir+results_cosine but appended to
    # results_dir+'/'+results_cosine, so with no trailing separator on
    # results_dir the duplicate check read a different file than it wrote.
    cosine_path = results_dir + results_cosine
    if os.path.isfile(cosine_path):
        print('file exists')
        with open(cosine_path) as infile:
            existing_results = infile.read().split('\n')
    else:
        existing_results = []

    with open(results_dir + results_words, 'w') as outfile1:
        for word1, word2 in zip(neighbor_words1, neighbor_words2):
            outfile1.write(word1 + '\t' + word2 + '\n')

    with open(cosine_path, 'a') as outfile2:
        result = target_word1 + '-' + target_word2 + '\t' + str(year1) + '-' + str(year2) + '\t' + str(cos) + '\n'
        if result.strip() in existing_results:
            print('result already there')
        else:
            outfile2.write(result)
    print(cos)