Exemplo n.º 1
0
def fixWikiMatched(path, wtitles, dfd, N):
    arts = loadDict("dicts/" + path)
    c = 0
    for a in arts:
        print "at", c, "from", path, "proc", os.getpid()
        c += 1
        for m in a["matched"]:
            if wiki.isDisambiguationPage(m):
                print "disambig"
                a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N)
                break

            elif wiki.isMissing(m):
                print "missing"
                a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N)
                break

    print "saving", path
    saveDict(arts, "dicts/fixed_" + path)
Exemplo n.º 2
0
def getNeighbors(artd, wikititles, range=3, BING=True, WIKIBOT=True, params=PARAMS):
    """Collect candidate Wikipedia titles for the article described by ``artd``.

    Candidates are gathered, in this order, from:

    1. the entries of ``wikititles`` around the position where the article's
       title would be inserted (``bisect_left``), skipping entries containing
       ``'isambiguation'``;
    2. three Bing queries restricted to wikipedia.org (only when ``BING``);
    3. ``wiki.queryInterface`` (only when ``WIKIBOT``);
    4. ``wiki.getSearchSuggestions`` and ``wiki.search``.

    The collected titles are then checked once each: missing pages are
    replaced by their search suggestions and pages for which
    ``removeDisambigCandidates`` returns a value are replaced by that value.
    Titles added during this check are not themselves re-checked.

    Side effects: ``artd['missing']`` and ``artd['disambig']`` are reset to
    sets holding the titles found missing / replaced as disambiguation pages.
    The normalised title and every missing title are printed.

    Args:
        artd: mapping with a ``'name'`` entry and, when ``BING`` or
            ``WIKIBOT`` is true, a ``'txt'`` entry.
        wikititles: list of titles; ``bisect`` requires it to be sorted.
        range: how many entries of ``wikititles`` to take before the
            insertion point, and how many from the insertion point onwards.
            (The name shadows the builtin but is part of the public
            signature, so it is kept.)
        BING: whether to add results from ``addBingLinks``.
        WIKIBOT: whether to add results from ``wiki.queryInterface``.
        params: object providing ``getNumWordsQ1()`` and ``getNumWordsQ2()``,
            the number of leading article words used in the queries.

    Returns:
        A list (in no particular order) of candidate titles with spaces
        replaced by underscores, excluding titles that begin with two or
        more digits.
    """
    # "Last, First" -> "First Last": reverse the comma-separated parts.
    title = ' '.join(artd['name'].split(',')[::-1]).strip()
    print(title)

    index = bisect.bisect_left(wikititles, title)
    # Clamp the lower bound: a negative start would be interpreted as an
    # offset from the END of the list, which yields an empty or wrong
    # neighbourhood whenever the title sorts near the beginning.
    start = max(0, index - range)
    res = set()
    for wt in wikititles[start:index + range]:
        if 'isambiguation' not in wt:
            res.add(wiki.reformat(wt))

    blim1 = params.getNumWordsQ1()
    blim2 = params.getNumWordsQ2()

    # Split the article text once; it is only needed by the optional sources.
    words = tools.splitToWords(artd['txt']) if (BING or WIKIBOT) else []

    if BING:
        # Query on the title with any trailing " (...)" qualifier removed.
        base_query = 'site%3Awikipedia.org%20' + re.sub(r' \(.*', '', title)
        # Title plus the first blim1 words, then plus the first blim2 words,
        # then the title alone.
        res = addBingLinks(base_query + '%20' + '%20'.join(words[:blim1]), res)
        res = addBingLinks(base_query + '%20' + '%20'.join(words[:blim2]), res)
        res = addBingLinks(base_query, res)

    if WIKIBOT:
        res |= set(wiki.queryInterface(title))
        res |= set(wiki.queryInterface('_'.join([title] + words[:blim1])))

    # Wiki suggestions, then wiki search.
    res = res | set(wiki.getSearchSuggestions(title, limit=5))
    res = res | set(wiki.search(title))

    # Deal with missing and disambiguation pages.
    artd['missing'] = set()
    artd['disambig'] = set()
    # Iterate over a snapshot because res is modified inside the loop.
    for wt in list(res):
        if 'Talk:' in wt:
            # Add the page the talk page refers to; the talk title itself
            # stays in res.
            res.add(wt.split('Talk:')[1])
            continue
        # Other namespaced titles (any ':' -- this includes 'Category:') and
        # list pages are kept as they are, without further checks.
        if ':' in wt or 'List_of' in wt:
            continue

        if wiki.isMissing(wt, rfmt=lambda x: x):
            # Replace a missing page by its search suggestions.  Snapshot
            # titles are unique and artd['missing'] was just reset, so every
            # missing title is recorded exactly once.
            res.discard(wt)
            artd['missing'].add(wt)
            res |= set(wiki.getSearchSuggestions(wt))
            print('miss %s' % (wt,))
            continue

        dtitles = removeDisambigCandidates(wt)
        if dtitles is not None:
            # Replace the page by the titles returned for it.
            artd['disambig'].add(wt)
            res.discard(wt)
            res |= dtitles

    res = set(t.replace(' ', '_') for t in res)

    # Drop titles that begin with two or more digits.
    leading_digits = re.compile(r'\d\d\d*')
    return [t for t in res if not leading_digits.match(t)]