Exemplo n.º 1
0
def get_results(testing, m):
    """Evaluate seeded paraphrase prediction against a frequency baseline.

    For each pair in ``testing`` three of its gold paraphrases are drawn at
    random as seeds.  Every key of ``probs`` that is not a seed is scored by
    summing ``probs[candidate][seed]`` over the seeds, and the ``m`` best
    candidates are compared with the pair's gold paraphrases.  The baseline
    is the first ``m`` non-seed entries of ``totals``.

    ``probs``, ``totals``, ``Paraphrase`` and ``random`` are not defined in
    this function and are read from the enclosing (module) scope.

    Args:
        testing: sequence of objects exposing a ``paraphrases`` list.
        m: number of predictions evaluated per pair.

    Returns:
        ``[acc, baseacc]``: the per-pair hit ratios (hits / m) of the
        predictions and of the baseline, summed and divided by
        ``len(testing)``.  Pairs skipped for having fewer than three gold
        paraphrases still count in that denominator.

    Raises:
        ZeroDivisionError: if ``testing`` is empty, or if ``m`` is 0 and at
            least one pair is scored.
    """
    # These accumulators are only ever updated with "+=", so they must be
    # bound first; without it the first increment raises UnboundLocalError.
    total = 0.0
    basetotal = 0.0
    errcount = 0      # too-short pairs plus missing probs entries
    nonerrcount = 0   # successful probs lookups

    for pair in testing:
        gold_paras = list(pair.paraphrases)

        if len(gold_paras) > 2:
            subs = random.sample(gold_paras, 3)
        else:
            errcount += 1
            print("List too short error.")
            continue

        # Baseline: walk the ranking in ``totals`` and keep non-seed entries.
        base = []
        for t in totals:
            candidate = Paraphrase(t[0])
            if candidate not in subs:
                base.append(candidate)
            if len(base) == m:
                break

        # All paraphrases known to the table, to be ordered by score for
        # this compound; the seed paraphrases are not allowed in predictions.
        results = []
        for key in probs:
            x = Paraphrase(key.strip())
            x.score = 0.0
            if x not in subs:
                results.append(x)

        for p in results:
            for s in subs:
                try:
                    p.score += probs[p.name][s.name]
                    nonerrcount += 1
                except KeyError:
                    # No table entry for this (candidate, seed) pair: the
                    # score is left as it is and the miss is counted.
                    errcount += 1
        results.sort(key=lambda para: para.score, reverse=True)

        score = 0.0
        basescore = 0.0
        for b in base[0:m]:
            if b in gold_paras:
                basescore += 1.0
        for r in results[0:m]:
            if r in gold_paras:
                score += 1.0
        total += score / float(m)
        basetotal += basescore / float(m)

    acc = total / len(testing)
    print("predictions:")
    print(acc)
    print("")
    baseacc = basetotal / len(testing)
    print("baseline:")
    print(baseacc)

    print(errcount)
    print(nonerrcount)
    return [acc, baseacc]
Exemplo n.º 2
0
def get_results(training, testing, m,
                web1t_path="/media/Iomega HDD/web1T/clean/"):
    """Evaluate paraphrase prediction using seeds mined from Web1T n-grams.

    A prior table and a probability table are built from ``training``.  For
    each pair in ``testing`` the n-gram patterns returned by ``Web1TSearch``
    for ``(pair.n2, pair.n1)`` are turned into seed paraphrases when their
    name, or their name prefixed with ``"be "``, is a key of the priors.
    Every key of the probability table that is not a seed is scored by
    summing ``probs[candidate][seed]`` over the seeds, and the ``m`` best are
    compared with the pair's gold paraphrases.  When no seed was found the
    baseline list is used as the prediction.  The baseline is the first ``m``
    non-seed paraphrases in descending prior order.

    Args:
        training: data passed to ``make_priors`` and ``make_prob_table``.
        testing: sequence of objects exposing ``n1``, ``n2`` and
            ``paraphrases``.
        m: number of predictions evaluated per pair.
        web1t_path: directory handed to ``Web1TSearch``.  Defaults to the
            location this function previously hard-coded (an alternative
            mount used before was ``"/media/usb0/web1T/clean/"``).

    Returns:
        ``[acc, baseacc]``: per-pair hit ratios (hits / m) of predictions and
        baseline, summed and divided by ``len(testing)``.

    Raises:
        ZeroDivisionError: if ``testing`` is empty or ``m`` is 0.
    """
    w = Web1TSearch(web1t_path)
    print("bulding probability table...")
    priors = make_priors(training)
    probs = make_prob_table(training, priors)
    count = 0
    print("done.")
    total = 0.0
    basetotal = 0.0
    errcount = 0      # probs lookups that raised KeyError
    nonerrcount = 0   # successful probs lookups
    # baseline of most frequent overall paraphrases
    totals = sorted(priors.items(), key=lambda x: x[1], reverse=True)

    for pair in testing:
        count += 1
        print(count)
        print("\n\n*************************************\n\n")
        gold_paras = list(pair.paraphrases)

        print(pair.n2 + " " + pair.n1)
        r = w.getNgrams(pair.n2, pair.n1)
        r = w.reducePats(r, pair.n2, pair.n1)
        # Highest value first, ties broken by key (also descending).
        sortedResults = sorted(r.items(),
                               key=lambda kv: (kv[1], kv[0]),
                               reverse=True)

        subs = []
        for s in sortedResults:
            p = Paraphrase(s[0].replace('_', ' '))
            # Test membership on the dict itself rather than on a key list.
            if p.name in priors:
                subs.append(p)
                print(p.name)
            be_name = "be " + p.name
            if be_name in priors:
                subs.append(Paraphrase(be_name))
                # Report the seed that was actually added, not the
                # unprefixed pattern.
                print(be_name)

        base = []
        for t in totals:
            candidate = Paraphrase(t[0])
            if candidate not in subs:
                base.append(candidate)
            if len(base) == m:
                break

        # a list of all paraphrases, to be ordered by score for this
        # compound; the seed paraphrases are not allowed in predictions
        results = []
        for key in probs:
            x = Paraphrase(key.strip())
            x.score = 0.0
            if x not in subs:
                results.append(x)

        for p in results:
            for s in subs:
                try:
                    p.score += probs[p.name][s.name]
                    nonerrcount += 1
                except KeyError:
                    # No table entry for this (candidate, seed) pair.
                    errcount += 1
        results.sort(key=lambda para: para.score, reverse=True)

        # Without seeds every score is 0.0, so fall back to the baseline.
        if not subs:
            results = list(base)

        print("")
        print("Gold:")
        for g in gold_paras:
            print(g.name)
        print("")

        print("Seeds")
        for s in subs:
            print(s.name)
        print("")

        print("Predictions: ")
        for p in results[0:m]:
            print(p.name)
        print("")

        print("Baseline:")
        for b in base:
            print(b.name)
        print("")

        score = 0.0
        basescore = 0.0
        for b in base[0:m]:
            if b in gold_paras:
                basescore += 1.0
        for r in results[0:m]:
            if r in gold_paras:
                score += 1.0
        total += score / float(m)
        basetotal += basescore / float(m)

    acc = total / len(testing)
    print("predictions:")
    print(acc)
    print("")
    baseacc = basetotal / len(testing)
    print("baseline:")
    print(baseacc)

    print(errcount)
    print(nonerrcount)
    return [acc, baseacc]
Exemplo n.º 3
0
     inp=raw_input()
     if inp in priors:
         seeds.append(Paraphrase(inp))
     else:
         print "not found"
 
 # Script fragment: for every pair, rank all non-seed paraphrases in ``probs``
 # by their summed probability given the seeds.  ``all_pairs``, ``probs``,
 # ``seeds`` and ``n`` are defined outside the lines visible here.
 print "working..."
 for pair in all_pairs:
     # Keep only this pair's paraphrases whose frequency is at least n.
     # NOTE(review): ``paras`` is not read again in the visible part of this
     # loop - confirm whether it is used further down or was meant to filter
     # the candidates.
     paras=[]
     for p in pair.paraphrases:
         if p.freq < n : continue
         paras.append(p)
     # Candidate list is rebuilt from scratch for every pair.
     results=[]
     
     for p in probs.keys():
         x=Paraphrase(p.strip())
         x.score=0.0
         # the seed paraphrases are not allowed in predictions
         if not x in seeds: results.append(x)
         
     # Score each candidate by summing probs[candidate][seed] over the seeds.
     for p in results:
         for s in seeds:
             try:
                 p.score+=probs[p.name][s.name]
                 #print "done"
             except KeyError:
                 # Missing table entry: the score is left unchanged and the
                 # miss is only reported on stdout.
                 pass
                 print "Key Error"
     # Best-scoring candidates first.
     # NOTE(review): nothing above depends on ``pair`` when building or
     # scoring ``results``, so each iteration appears to produce the same
     # ranking - verify against the rest of the script.
     results.sort(key= lambda para: para.score, reverse=True)
 
 for r in results[0:10]:
Exemplo n.º 4
0
def get_results(training, testing, m):
    """Evaluate prior-weighted paraphrase prediction against two baselines.

    A prior table and a probability table are built from ``training``.  For
    each pair in ``testing`` three of its gold paraphrases are drawn at
    random as seeds.  Every key of the probability table that is not a seed
    starts from its prior and is multiplied by ``probs[candidate][seed]`` for
    each seed; the ``m`` best candidates are compared with the pair's gold
    paraphrases.  Two baselines are scored the same way: the first ``m``
    non-seed paraphrases in descending prior order, and ``m`` distinct
    non-seed paraphrases drawn at random from the priors.

    Args:
        training: data passed to ``make_priors_freq`` and
            ``make_prob_table``.
        testing: sequence of objects exposing a ``paraphrases`` list.
        m: number of predictions evaluated per pair.

    Returns:
        ``[acc, baseacc, rand_baseacc]``: per-pair hit ratios (hits / m) of
        the predictions, the most-frequent baseline and the random baseline,
        each summed and divided by ``len(testing)``.  Pairs skipped for
        having fewer than three gold paraphrases still count in that
        denominator.

    Raises:
        ZeroDivisionError: if ``testing`` is empty, or if ``m`` is 0 and at
            least one pair is scored.
        KeyError: if a candidate's name is not a key of the priors.
    """
    print("bulding probability table...")
    priors = make_priors_freq(training)
    probs = make_prob_table(training, priors)
    print("done.")
    total = 0.0
    basetotal = 0.0
    rand_basetotal = 0.0
    errcount = 0      # too-short pairs plus missing probs entries
    nonerrcount = 0   # successful probs lookups
    # baseline of most frequent overall paraphrases
    totals = sorted(priors.items(), key=lambda x: x[1], reverse=True)
    prior_names = list(priors)

    for pair in testing:
        gold_paras = list(pair.paraphrases)

        if len(gold_paras) > 2:
            subs = random.sample(gold_paras, 3)
        else:
            errcount += 1
            print("List too short error.")
            continue

        # Most-frequent baseline.  This is built exactly once: running the
        # fill loop a second time appended further entries to ``base`` and,
        # when fewer than m non-seed entries exist, put duplicates inside
        # base[0:m] that could be counted as separate hits.
        base = []
        for t in totals:
            candidate = Paraphrase(t[0])
            if candidate not in subs:
                base.append(candidate)
            if len(base) == m:
                break

        # Random baseline.  It is scored over [0:m] and divided by m, so it
        # draws m items (not a fixed 3), without repeats, and never more than
        # the number of non-seed paraphrases available (which also avoids
        # looping forever when there are none).
        rand_pool = []
        for name in prior_names:
            candidate = Paraphrase(name)
            if candidate not in subs:
                rand_pool.append(candidate)
        rand_base = random.sample(rand_pool, min(m, len(rand_pool)))

        # a list of all paraphrases, to be ordered by score for this
        # compound; the seed paraphrases are not allowed in predictions
        results = []
        for key in probs:
            x = Paraphrase(key.strip())
            x.score = 0.0
            if x not in subs:
                results.append(x)

        for p in results:
            p.score = priors[p.name]
            for s in subs:
                try:
                    p.score = p.score * probs[p.name][s.name]
                    nonerrcount += 1
                except KeyError:
                    # No table entry for this (candidate, seed) pair: the
                    # running product is left unchanged and the miss counted.
                    errcount += 1
        results.sort(key=lambda para: para.score, reverse=True)

        score = 0.0
        basescore = 0.0
        rand_basescore = 0.0
        for p in rand_base[0:m]:
            if p in gold_paras:
                rand_basescore += 1.0
        for b in base[0:m]:
            if b in gold_paras:
                basescore += 1.0
        for r in results[0:m]:
            if r in gold_paras:
                score += 1.0
        total += score / float(m)
        basetotal += basescore / float(m)
        rand_basetotal += rand_basescore / float(m)

    acc = total / len(testing)
    print("predictions:")
    print(acc)
    print("")
    baseacc = basetotal / len(testing)
    print("most frequent baseline:")
    print(baseacc)

    rand_baseacc = rand_basetotal / len(testing)
    print("random baseline:")
    print(rand_baseacc)

    print(errcount)
    print(nonerrcount)
    return [acc, baseacc, rand_baseacc]