Пример #1
def evalsemrel(l1, l2, pairsfile, l1colnum, l2colnum, l2gpmffile,
               l1l2methpmffile):
    '''Evaluate semantic relatedness.

    For every translation pair, the gold pmf of the l2 word is compared with
    the method-induced pmf of the l1 word over the union of their supports.
    One line "JS-divergence TAB Spearman-rho TAB p-value" is printed per pair,
    followed by a final line holding the mean divergence and the mean rho.

    l1 and l2 = languages s and t in p(t|s)
    pairsfile = translation pairs in l1 and l2, one whitespace-separated pair
                per line
    l1colnum, l2colnum = 1-based columns of the l1 and l2 words in pairsfile
    l2gpmffile = gold pmf over l2 words, for l2 words (including those in pairsfile)
    l1l2methpmffile = method-induced pmf over l2 words, for l1 words (including those
                      in pairsfile)
    '''
    # NOTE(review): `l1l2methpmffile` was restored to the signature because the
    # body reads it; confirm against the call sites.
    l2gpmf = L1L2PMF(l2, l2, l2gpmffile)
    l1l2pmf = L1L2PMF(l1, l2, l1l2methpmffile)
    jsds, rhos = [], []
    with open(pairsfile) as pairs:
        for line in pairs:
            pair = line.decode('utf-8').rstrip().split()
            w1, w2 = pair[l1colnum - 1], pair[l2colnum - 1]
            gpmf = l2gpmf.pmf[w2]  # gold pmf
            mpmf = l1l2pmf.pmf[w1]  # method pmf
            # Align both pmfs on the union of their supports; a word missing
            # from one side contributes probability 0.0 on that side.
            vecs = [(gpmf[x2], mpmf[x2]) for x2 in gpmf if x2 in mpmf]
            vecs.extend([(gpmf[x2], 0.0) for x2 in gpmf if x2 not in mpmf])
            vecs.extend([(0.0, mpmf[x2]) for x2 in mpmf if x2 not in gpmf])
            gvec, mvec = zip(*vecs)
            jsd = MyUtils.jsd(gvec, mvec, base=2)
            rho, pval = mstats.spearmanr(gvec, mvec, use_ties=True)
            # Collect the per-pair values; without this the averages below
            # would always divide by zero.
            jsds.append(jsd)
            rhos.append(rho)
            print("%f\t%f\t%f" % (jsd, rho, pval))
    if jsds:  # nothing to average when pairsfile is empty
        print("\t\t\t%f\t%f" % (sum(jsds) / len(jsds), sum(rhos) / len(rhos)))
Пример #2
 def __init__(self, sl, tl, al, at_pmffile, sa_pmffile, at_tvocfile,
              as_avocfile):
     '''Input: source lang, target lang, auxiliary lang, p(t|a), p(a|s), |Vat_t|, |Vas_a|.

     at_tvocfile and as_avocfile are read as whitespace-separated, UTF-8
     encoded word lists; their distinct words form Vat_t and Vas_a.
     '''
     # NOTE(review): `as_avocfile` was restored to the signature because the
     # body reads it; confirm against the call sites.
     self.sl, self.tl, self.al = sl, tl, al
     self.PtGa = L1L2PMF(al, tl, at_pmffile)  # p(t|a)
     self.PaGs = L1L2PMF(sl, al, sa_pmffile)  # p(a|s)
     # target vocabulary read from at_tvocfile, and the reciprocal of its size
     with open(at_tvocfile) as vocfile:
         self.Vat_t = set(vocfile.read().decode('utf-8').split())
     self.OneByVat_t = 1 / float(len(self.Vat_t))
     # auxiliary vocabulary read from as_avocfile, and the reciprocal of its size
     with open(as_avocfile) as vocfile:
         self.Vas_a = set(vocfile.read().decode('utf-8').split())
     self.OneByVas_a = 1 / float(len(self.Vas_a))
Пример #3
def getpmmc(sl, tl, slvecfile, tlvocfile, st_pmffiles, K,
            tr_ts_cand_par_filelist, truncprob):
    '''Write truncated candidate pmfs scored by a MixModelClus model.

    sl, tl = source and target languages
    slvecfile = passed through to MixModelClus
    tlvocfile = whitespace-separated, UTF-8 target words to score
    st_pmffiles = comma separated list of pmf files
    K = accepted but not used in this function
    tr_ts_cand_par_filelist = one job per line, seven whitespace-separated
        fields: tsfile colnum trfile l1 l2 candfile paramfile
        trfile = (s,t) pairs such that they belong to vocab of st, at, and as
                 corpora (so that impact of bringing in `a' can be measured
                 accurately)
        tsfile = should not contain words present in trfile
        colnum = 1-based column in tsfile, from which to read test words
        candfile = gzip output; per test word one line of the form
                   word TAB "target prob" TAB "target prob" ...
        paramfile = parsed but not used in this function
    truncprob = forwarded to L1L2PMF.truncate_pmf
    '''
    mmc = MixModelClus(sl, tl, slvecfile, st_pmffiles.split(','))
    with open(tlvocfile) as vocfile:
        tlvoc = vocfile.read().decode('utf-8').split()
    with open(tr_ts_cand_par_filelist) as jobs:
        for job in jobs:
            if not job.strip():
                continue  # tolerate blank lines in the job list
            tsfile, colnum, trfile, l1, l2, candfile, paramfile = job.split()
            colnum = int(colnum)

            mmc.settrdata(trfile, l1, l2)

            with open(tsfile) as tests:
                queries = [row.decode('utf-8').strip().split()[colnum - 1]
                           for row in tests]
            # Closing the gzip stream is what completes the file on disk, so
            # each job's output is scoped by a context manager.
            with gzip.open(candfile, 'wb') as out:
                for s in queries:
                    cands = [(t, mmc.pmm(t, s)) for t in tlvoc]
                    cands.sort(key=lambda x: x[1], reverse=True)
                    cands = L1L2PMF.truncate_pmf(cands, truncprob,
                                                 renormalize=True)
                    line = s + u'\t' + u'\t'.join(
                        [t + u' ' + unicode(pr) for t, pr in cands])
                    out.write(line.encode('utf-8') + b'\n')
Пример #4
def getpmmc(sl, tl, slvecfile, tlvocfile, st_pmffiles, K,
            tr_ts_cand_par_filelist, truncprob):
    '''Write truncated candidate pmfs scored by a MixModelClus model.

    sl, tl = source and target languages
    slvecfile = passed through to MixModelClus
    tlvocfile = whitespace-separated, UTF-8 target words to score
    st_pmffiles = comma separated list of pmf files
    K = accepted but not used in this function
    tr_ts_cand_par_filelist = one job per line, seven whitespace-separated
        fields: tsfile colnum trfile l1 l2 candfile paramfile
        trfile = (s,t) pairs such that they belong to vocab of st, at, and as
                 corpora (so that impact of bringing in `a' can be measured
                 accurately)
        tsfile = should not contain words present in trfile
        colnum = 1-based column in tsfile, from which to read test words
        candfile = gzip output; per test word one line of the form
                   word TAB "target prob" TAB "target prob" ...
        paramfile = parsed but not used in this function
    truncprob = forwarded to L1L2PMF.truncate_pmf
    '''
    mmc = MixModelClus(sl, tl, slvecfile, st_pmffiles.split(','))
    with open(tlvocfile) as vocfile:
        tlvoc = vocfile.read().decode('utf-8').split()
    with open(tr_ts_cand_par_filelist) as jobs:
        for job in jobs:
            if not job.strip():
                continue  # tolerate blank lines in the job list
            tsfile, colnum, trfile, l1, l2, candfile, paramfile = job.split()
            colnum = int(colnum)

            mmc.settrdata(trfile, l1, l2)

            # The list of test words is built first and closed off properly;
            # the loop over it then writes one output line per word.
            with open(tsfile) as tests:
                queries = [
                    row.decode('utf-8').strip().split()[colnum - 1]
                    for row in tests
                ]
            # Closing the gzip stream is what completes the file on disk.
            with gzip.open(candfile, 'wb') as out:
                for s in queries:
                    cands = [(t, mmc.pmm(t, s)) for t in tlvoc]
                    cands.sort(key=lambda x: x[1], reverse=True)
                    cands = L1L2PMF.truncate_pmf(cands, truncprob,
                                                 renormalize=True)
                    line = s + u'\t' + u'\t'.join(
                        [t + u' ' + unicode(pr) for t, pr in cands])
                    out.write(line.encode('utf-8') + b'\n')
Пример #5
def getpmm4eps(sl, tl, tlvocfile, st_pmffiles, epsfile,
               tr_ts_cand_lat_filelist, tr_or_ts, truncprob):
    '''Write truncated candidate pmfs scored by a MixModel.

    sl, tl = source and target languages
    tlvocfile = whitespace-separated, UTF-8 target words to score
    st_pmffiles = comma separated list of pmf files, starting with
                  the base dist, followed by aux lang dist
    epsfile = accepted but not used in this function
    tr_ts_cand_lat_filelist = one job per line, seven whitespace-separated
        fields: tsfile colnum trfile l1 l2 candfile epsfile1
        trfile = (s,t) pairs such that they belong to vocab of st, at, and as
                 corpora (so that impact of bringing in a can be measured
                 accurately)
        tsfile = should not contain words present in trfile
        colnum = 1-based column of the query word in the file being read
        candfile = gzip output; per query word one line of the form
                   word TAB "target prob" TAB "target prob" ...
        l1, l2, epsfile1 = parsed but not used in this function
    tr_or_ts = "TEST" reads query words from tsfile; any other value reads
               them from trfile
    truncprob = forwarded to L1L2PMF.truncate_pmf
    '''
    mm = MixModel(sl, tl, st_pmffiles.split(','))

    with open(tlvocfile) as vocfile:
        tlvoc = vocfile.read().decode('utf-8').split()
    with open(tr_ts_cand_lat_filelist) as jobs:
        for job in jobs:
            if not job.strip():
                continue  # tolerate blank lines in the job list
            tsfile, colnum, trfile, l1, l2, candfile, epsfile1 = job.split()
            colnum = int(colnum)

            qfile = tsfile if tr_or_ts == "TEST" else trfile
            with open(qfile) as qrows:
                queries = [
                    row.decode('utf-8').strip().split()[colnum - 1]
                    for row in qrows
                ]
            # Closing the gzip stream is what completes the file on disk.
            with gzip.open(candfile, 'wb') as out:
                for s in queries:
                    cands = [(t, mm.pmm(t, s)) for t in tlvoc]
                    cands.sort(key=lambda x: x[1], reverse=True)
                    cands = L1L2PMF.truncate_pmf(cands, truncprob,
                                                 renormalize=True)
                    line = s + u'\t' + u'\t'.join(
                        [t + u' ' + unicode(pr) for t, pr in cands])
                    out.write(line.encode('utf-8') + b'\n')
Пример #6
def getpmmfine(sl, tl, tlvocfile, st_pmffile, sta_pmffile,
               tr_ts_cand_lat_filelist, st_ptmfile, truncprob):
    '''Write truncated candidate pmfs scored by a MixModelFine model.

    sl, tl = source and target languages
    tlvocfile = whitespace-separated, UTF-8 target words to score
    st_pmffile, sta_pmffile = the two pmf files handed to MixModelFine
    tr_ts_cand_lat_filelist = one job per line, seven whitespace-separated
        fields: tsfile colnum trfile l1 l2 candfile latentfile
        trfile = (s,t) pairs such that they belong to vocab of st, at, and as
                 corpora (so that impact of bringing in a can be measured
                 accurately)
        tsfile = should not contain words present in trfile
        colnum = 1-based column in tsfile, from which to read test words
        candfile = gzip output; per test word one line of the form
                   word TAB "target prob" TAB "target prob" ...
        latentfile = parsed but not used in this function
    st_ptmfile = file loaded with PTM.load; the result initialises the
                 SourceCatDist given to the model
    truncprob = forwarded to L1L2PMF.truncate_pmf
    '''
    stptm = PTM.load(st_ptmfile)
    scd = SourceCatDist()
    scd.set_from_PTM(stptm, sl)
    mmf = MixModelFine(sl, tl, [st_pmffile, sta_pmffile], scd)
    with open(tlvocfile) as vocfile:
        tlvoc = vocfile.read().decode('utf-8').split()
    with open(tr_ts_cand_lat_filelist) as jobs:
        for job in jobs:
            if not job.strip():
                continue  # tolerate blank lines in the job list
            tsfile, colnum, trfile, l1, l2, candfile, latentfile = job.split()
            colnum = int(colnum)

            mmf.settrdata(trfile, l1, l2)

            with open(tsfile) as tests:
                queries = [
                    row.decode('utf-8').strip().split()[colnum - 1]
                    for row in tests
                ]
            # Closing the gzip stream is what completes the file on disk.
            with gzip.open(candfile, 'wb') as out:
                for s in queries:
                    cands = [(t, mmf.pmm(t, s)) for t in tlvoc]
                    cands.sort(key=lambda x: x[1], reverse=True)
                    cands = L1L2PMF.truncate_pmf(cands, truncprob,
                                                 renormalize=True)
                    line = s + u'\t' + u'\t'.join(
                        [t + u' ' + unicode(pr) for t, pr in cands])
                    out.write(line.encode('utf-8') + b'\n')
Пример #7
def savecands(l1, qfile, l2, meth, truncprob, candfile):
    '''Write a truncated candidate pmf for every query word.

    l1, l2 = languages passed to meth.get_similar
    qfile = whitespace-separated, UTF-8 query words
    meth = object whose get_similar(l1, word, l2) returns (word, score) pairs
    truncprob = forwarded to L1L2PMF.truncate_pmf
    candfile = gzip output; per query word one line of the form
               word TAB "candidate prob" TAB "candidate prob" ...
    '''
    with open(qfile) as queries:
        words = queries.read().decode('utf-8').split()
    # Closing the gzip stream is what completes the file on disk.
    with gzip.open(candfile, 'wb') as out:
        for w1 in words:
            cands = meth.get_similar(l1, w1, l2)
            # convert to pmf; float() keeps the division from truncating when
            # the scores happen to be integers
            tot = float(sum(sc for _, sc in cands))
            cands = [(w, sc / tot) for w, sc in cands]
            # truncate and renormalize pmf
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = w1 + '\t' + '\t'.join(
                [w2 + ' ' + str(sc) for w2, sc in cands])
            out.write(line.encode('utf-8') + b'\n')
Пример #8
def savecands(l1, qfile, l2, meth, truncprob, candfile):
    '''Write a truncated candidate pmf for every query word.

    l1, l2 = languages passed to meth.get_similar
    qfile = whitespace-separated, UTF-8 query words
    meth = object whose get_similar(l1, word, l2) returns (word, score) pairs
    truncprob = forwarded to L1L2PMF.truncate_pmf
    candfile = gzip output; per query word one line of the form
               word TAB "candidate prob" TAB "candidate prob" ...
    '''
    with open(qfile) as queries:
        words = queries.read().decode('utf-8').split()
    # The with-block guarantees the gzip stream is closed, and therefore
    # complete on disk, even if scoring a word raises.
    with gzip.open(candfile, 'wb') as out:
        for w1 in words:
            cands = meth.get_similar(l1, w1, l2)
            # convert to pmf; float() keeps the division from truncating when
            # the scores happen to be integers
            tot = float(sum(sc for _, sc in cands))
            cands = [(w, sc / tot) for w, sc in cands]
            # truncate and renormalize pmf
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            fields = [w2 + ' ' + str(sc) for w2, sc in cands]
            line = w1 + '\t' + '\t'.join(fields)
            out.write(line.encode('utf-8') + b'\n')
Пример #9
    def SRptm(l1, l2, l1l2pairsfile, l1l2candfile, l2l1candfile):
        '''Score word pairs by their translation probabilities.

        Input: p(t|s), p(s|t), {(s,t)} pairs from human annotation task
        Output: {(s,t,score)} for all the pairs, printed one per line as
                UTF-8 encoded, tab-separated text
            For each s,t pair
                Get distributions p(t'|s) and p(s'|t).
                Take the max of p(t|s) and p(s|t); this is the score.
                A pair absent from a distribution contributes 0 for it.

        NOTE(review): an l-INF normalisation (dividing each probability by
        the largest value of its distribution) was part of the described
        algorithm but is disabled in the code; raw probabilities are used.
        '''
        l1l2, l2l1 = L1L2PMF(l1, l2, l1l2candfile), L1L2PMF(l2, l1, l2l1candfile)
        with open(l1l2pairsfile) as pairs:
            for line in pairs:
                w1, w2 = line.decode('utf-8').rstrip().split()
                rl1l2 = l1l2.pmf[w1][w2] if w2 in l1l2.pmf[w1] else 0
                rl2l1 = l2l1.pmf[w2][w1] if w1 in l2l1.pmf[w2] else 0
                score = max(rl1l2, rl2l1)
                print((w1 + '\t' + w2 + '\t' + unicode(score)).encode('utf-8'))
Пример #10
def getpaux(sl, qfile, tl, tlvocfile, al, at_pmffile, sa_pmffile, at_tvocfile,
            as_avocfile, truncprob, candfile):
    '''Write truncated candidate pmfs scored through an auxiliary language.

    sl, tl, al = source, target and auxiliary languages
    qfile = whitespace-separated, UTF-8 source words to query
    tlvocfile = whitespace-separated, UTF-8 target words to score
    at_pmffile, sa_pmffile, at_tvocfile, as_avocfile = passed through to Paux
    truncprob = forwarded to L1L2PMF.truncate_pmf
    candfile = gzip output; per query word one line of the form
               word TAB "target prob" TAB "target prob" ...

    Source words that are not keys of pa.PaGs.pmf are skipped and produce no
    output line.
    '''
    pa = Paux(sl, tl, al, at_pmffile, sa_pmffile, at_tvocfile, as_avocfile)
    with open(tlvocfile) as vocfile:
        tlvoc = vocfile.read().decode('utf-8').split()
    with open(qfile) as queries:
        words = queries.read().decode('utf-8').split()
    # Closing the gzip stream is what completes the file on disk.
    with gzip.open(candfile, 'wb') as out:
        for s in words:
            if s not in pa.PaGs.pmf:
                continue  # ignore source words without data
            cands = [(t, pa.p_a(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = s + '\t' + '\t'.join([t + ' ' + str(pr) for t, pr in cands])
            out.write(line.encode('utf-8') + b'\n')
Пример #11
def getcands4meth(qfile, colnum, candfile, truncprob, getrelmeth):
    '''Write a truncated candidate pmf for the query word on each line.

    qfile = UTF-8 text, one query per line; the word is taken from the
            whitespace-separated column colnum (1-based)
    candfile = gzip output; per query word one line of the form
               word TAB "candidate prob" TAB "candidate prob" ...
    truncprob = forwarded to L1L2PMF.truncate_pmf
    getrelmeth = callable mapping a word to a list of (candidate, score)
                 pairs; the list must not be empty (min() raises ValueError)

    Scores are shifted by their minimum so that none is negative and then
    normalised to sum to one. When the shifted scores sum to zero (a single
    candidate, or all scores equal) a uniform pmf is used instead of
    dividing by zero.
    '''
    with open(qfile) as queries, gzip.open(candfile, 'wb') as out:
        for line in queries:
            w1 = line.decode('utf-8').rstrip().split()[colnum - 1]
            cands = getrelmeth(w1)
            # convert to pmf
            minsc = min(sc for _, sc in cands)
            # shift all scores to make them non-negative
            cands = [(w, sc - minsc) for w, sc in cands]
            # float() keeps the division from truncating on integer scores
            tot = float(sum(sc for _, sc in cands))
            if tot:
                cands = [(w, sc / tot) for w, sc in cands]
            else:
                share = 1.0 / len(cands)
                cands = [(w, share) for w, _ in cands]
            # truncate and renormalize pmf
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            line = w1 + u'\t' + u'\t'.join(
                [w + u' ' + unicode(sc) for w, sc in cands])
            out.write(line.encode('utf-8') + b'\n')
Пример #12
 def __init__(self, sl, tl, st_pmffiles):
     '''Input: source lang, target lang, p(t|s) pmf file names (one per expert).

     For every pmf file the loaded pmf, the set of target words it mentions
     and the reciprocal of that set's size are stored; the expert mixture
     starts out uniform.
     '''
     self.sl, self.tl = sl, tl
     self.beta = []  # p(t|s) pmfs for different experts
     self.tvocs = []  # target vocabularies for different experts
     self.OneByVt = []  # 1/V_t for different experts
     for fil in st_pmffiles:
         pmf = L1L2PMF(sl, tl, fil).pmf
         self.beta.append(pmf)
         # NOTE(review): the statement that consumed the "t for s ... for t"
         # clause was missing; collecting the targets into a set is assumed
         # from the len() taken on the next line. Confirm.
         self.tvocs.append(set(t for s in pmf for t in pmf[s]))
         self.OneByVt.append(1.0 / len(self.tvocs[-1]))
     self.E = len(self.beta)  # number of experts
     # default expert mixture: equal weight for every expert
     self.eps = [1.0 / self.E for _ in range(self.E)]
Пример #13
def getcands4meth(qfile, colnum, candfile, truncprob, getrelmeth):
    '''Write a truncated candidate pmf for the query word on each line.

    qfile = UTF-8 text, one query per line; the word is taken from the
            whitespace-separated column colnum (1-based)
    candfile = gzip output; per query word one line of the form
               word TAB "candidate prob" TAB "candidate prob" ...
    truncprob = forwarded to L1L2PMF.truncate_pmf
    getrelmeth = callable mapping a word to a list of (candidate, score)
                 pairs; the list must not be empty (min() raises ValueError)

    Scores are shifted by their minimum so that none is negative and then
    normalised to sum to one. When the shifted scores sum to zero (a single
    candidate, or all scores equal) a uniform pmf is used instead of
    dividing by zero.
    '''
    with open(qfile) as queries, gzip.open(candfile, 'wb') as out:
        for row in queries:
            w1 = row.decode('utf-8').rstrip().split()[colnum - 1]
            scored = getrelmeth(w1)
            # convert to pmf: shift all scores to make them non-negative ...
            lowest = min(sc for _, sc in scored)
            shifted = [(w, sc - lowest) for w, sc in scored]
            # ... then normalise; float() keeps the division from truncating
            # on integer scores
            tot = float(sum(sc for _, sc in shifted))
            if tot:
                cands = [(w, sc / tot) for w, sc in shifted]
            else:
                share = 1.0 / len(shifted)
                cands = [(w, share) for w, _ in shifted]
            # truncate and renormalize pmf
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            text = w1 + u'\t' + u'\t'.join(
                [w + u' ' + unicode(sc) for w, sc in cands])
            out.write(text.encode('utf-8') + b'\n')
Пример #14
 def __init__(self, sl, tl, slvecfile, st_pmffiles):
     '''Input: source lang, target lang, source word feature vectors,
      p(t|s) pmf file names (one per expert).

     The feature vectors are loaded with WordFeat.loadfeat and passed
     through self.scalefeat. For every pmf file the loaded pmf, the set of
     target words it mentions and the reciprocal of that set's size are
     stored.
     '''
     self.sl, self.tl = sl, tl
     self.slfeat = WordFeat.loadfeat(slvecfile)
     self.slfeat = self.scalefeat(self.slfeat)
     self.beta = []  # p(t|s) pmfs for different experts
     self.tvocs = []  # target vocabularies for different experts
     self.OneByVt = []  # 1/V_t for different experts
     for fil in st_pmffiles:
         pmf = L1L2PMF(sl, tl, fil).pmf
         self.beta.append(pmf)
         # NOTE(review): the statement that consumed the "t for s ... for t"
         # clause was missing; collecting the targets into a set is assumed
         # from the len() taken on the next line. Confirm.
         self.tvocs.append(set(t for s in pmf for t in pmf[s]))
         self.OneByVt.append(1.0 / len(self.tvocs[-1]))
     self.E = len(self.beta)  # number of experts
Пример #15
 def __init__(self, sl, tl, st_pmffiles, scd):
     '''Input: source lang, target lang, p(t|s) pmf file names,
     source categ dist object

     For every pmf file the loaded pmf, the set of target words it mentions
     and the reciprocal of that set's size are stored. The number of
     categories is taken from len(scd.phi).
     '''
     self.sl, self.tl = sl, tl
     self.beta = []  # p(t|s) pmfs for different experts
     self.tvocs = []  # target vocabularies for different experts
     self.OneByVt = []  # 1/V_t for different experts
     for fil in st_pmffiles:
         pmf = L1L2PMF(sl, tl, fil).pmf
         self.beta.append(pmf)
         # NOTE(review): the statement that consumed the "t for s ... for t"
         # clause was missing; collecting the targets into a set is assumed
         # from the len() taken on the next line. Confirm.
         self.tvocs.append(set(t for s in pmf for t in pmf[s]))
         self.OneByVt.append(1.0 / len(self.tvocs[-1]))
     self.E = len(self.beta)  # number of experts
     self.scd = scd
     self.T = len(scd.phi)  # number of categories
Пример #16
def getpmm(sl, tl, tlvocfile, st_pmffiles, tr_ts_cand_lat_filelist, truncprob,
    # NOTE(review): the parameter list is cut off above (no closing
    # parenthesis); `learnmeth`, read further down, is not among the visible
    # parameters. Restore the signature from the original file before use.
    #
    # Purpose, as far as the visible code shows: build a MixModel, then for
    # every job line score each target-vocabulary word against each test word
    # and write the truncated, renormalised candidates to a gzip file.
    #    mm = MixModel(sl, tl, [st_pmffile, sta_pmffile])
    # st_pmffiles = comma separated list of pmf files, starting with
    #               the base dist, followed by aux lang dist
    mm = MixModel(sl, tl, st_pmffiles.split(','))

    # target words to score: whitespace-separated, decoded from UTF-8
    tlvoc = open(tlvocfile).read().decode('utf-8').split()
    # each line of the list file describes one job in seven
    # whitespace-separated fields
    for line in open(tr_ts_cand_lat_filelist):
        line = line.rstrip()
        tsfile, colnum, trfile, l1, l2, candfile, epsfile = line.split()
        # trfile = (s,t) pairs such that they belong to vocab of st, at, and as corpora
        #          (so that impact of bringing in a can be measured accurately)
        # tsfile = should not contain words present in trfile
        # colnum = 1-based column of the test word (indexed as colnum - 1 below)
        colnum = int(colnum)

        mm.settrdata(trfile, l1, l2)
        # NOTE(review): both branches below have no body here; whatever was
        # done for "EM" and "GRID" is missing and cannot be inferred from
        # the surrounding code.
        if learnmeth == "EM":
        elif learnmeth == "GRID":

        # NOTE(review): f is never closed explicitly in the visible code.
        f = gzip.open(candfile, 'wb')

        # NOTE(review): the list comprehension below is never closed (the
        # closing bracket and the colon of the for statement are absent).
        for s in [
                line.decode('utf-8').strip().split()[colnum - 1]
                for line in open(tsfile).readlines()
            #        start = time.clock()
            # score every target word for s, best first
            cands = [(t, mm.pmm(t, s)) for t in tlvoc]
            cands.sort(key=lambda x: x[1], reverse=True)
            cands = L1L2PMF.truncate_pmf(cands, truncprob, renormalize=True)
            # output line: test word, then tab-separated "target prob" items
            line = s + u'\t' + u'\t'.join(
                [t + u' ' + unicode(pr) for t, pr in cands])
            print >> f, line.encode('utf-8')
    #        print >> sys.stderr, "%d secs, %d words" % (time.clock() - start, len(cands))