예제 #1
0
def properFilterArts(arts, params=PARAMS):
    
    #iF year and title - its matched
    #iF no year - find first with year and title if there is one.
    #iF diff greater than 80 then it's bad
    
    res = []
    c = 0
    MAX_DIFF = 80
    
    for a in arts:
        matched = []
        first_name, first_score = a['candids'][0]
        if wiki.isDisambiguationPage(first_name):
                print name, 'dis'
                
        else:
            first_sum = sum(first_score.values())
            if first_score['years'] > 0 and first_score['titles'] > 0:
                matched = [wiki.getRedirect(first_name)]
                a['matched'] = matched
                res.append(a)
                continue

        #not first match        
        for name, score in a['candids'][1:]:
            if wiki.isDisambiguationPage(name):
                print name, 'dis'
                continue
            
            if score['years'] > 0 and score['titles'] > 0:
                asum = sum(score.values())
                if first_sum - asum < MAX_DIFF:
                    a['matched'] = [wiki.getRedirect(name)]
                    res.append(a)
                    break
                else:
                    print a['name'],'DIFFBIG:',asum,name
                    break
    return res
예제 #2
0
def fixWikiMatched(path, wtitles, dfd, N):
    arts = loadDict("dicts/" + path)
    c = 0
    for a in arts:
        print "at", c, "from", path, "proc", os.getpid()
        c += 1
        for m in a["matched"]:
            if wiki.isDisambiguationPage(m):
                print "disambig"
                a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N)
                break

            elif wiki.isMissing(m):
                print "missing"
                a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N)
                break

    print "saving", path
    saveDict(arts, "dicts/fixed_" + path)
예제 #3
0
def removeDisambigCandidates(wt):
    if wiki.isDisambiguationPage(wt):
        print 'disambig', wt
        dtitles = wiki.getLinks(wt)
        return set(dtitles)