예제 #1
0
def get_persons(path, out):
    """Keep the pickled articles whose name resolves to a person page.

    For every article the name (with anything from the first " (" onwards
    stripped) is sent to ``bing.query`` restricted to wikipedia.org.  Result
    links containing ``en.wikipedia.org/wiki/`` are converted to candidate
    titles with ``wiki.reformat``; the article is kept when
    ``wiki.isPerson`` is truthy for at least one candidate.

    Progress is printed as it goes: the name for a kept article, the name
    prefixed with a tab for a rejected one.

    Args:
        path: path of a pickle file holding an iterable of article dicts,
            each with a ``'name'`` key.
        out: path of a text file that receives the name of every kept
            article, one per line (the file is overwritten).

    Returns:
        The list of kept article dicts, in input order.

    Raises:
        Any exception raised while building or checking the candidates is
        re-raised after ``no match <name>`` has been printed.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file -- only point this at pickle files from a trusted source.
    # Binary mode is what pickle expects (mandatory on Python 3 and for
    # binary protocols on Windows).
    with open(path, 'rb') as f:
        arts = pickle.load(f)

    res = []

    # The context manager closes the output file even when the loop raises.
    with open(out, 'w') as w:
        for art in arts:
            t = art['name']
            q = 'site:wikipedia.org%20' + re.sub(r' \(.*', '', t)
            links = [x for x in bing.query(q, 3)
                     if 'en.wikipedia.org/wiki/' in x]
            try:
                candids = [wiki.reformat(x.split('/wiki/')[1]) for x in links]
                # any() stops at the first positive answer, like the
                # original "person = person or ..." accumulation did.
                person = any(wiki.isPerson(c) for c in candids)
            except Exception:
                print('no match ' + t)
                raise
            else:
                if person:
                    print(t)
                    res.append(art)
                    w.write(t)
                    w.write('\n')
                else:
                    print('\t' + t)
    return res
예제 #2
0
def getNeighbors(artd, wikititles, range=3, BING=True, WIKIBOT=True, params=PARAMS):
    """Collect candidate Wikipedia titles for an article.

    Candidates come from, in order: the titles surrounding the article
    name in ``wikititles``, Bing queries (when ``BING``), the wiki query
    interface (when ``WIKIBOT``), wiki search suggestions and wiki search.
    Candidates reported missing are replaced by their search suggestions
    and candidates for which ``removeDisambigCandidates`` returns a set
    are replaced by that set.

    Side effects: prints the normalised title and every missing candidate,
    and overwrites ``artd['missing']`` and ``artd['disambig']`` with the
    sets of candidates dropped for those two reasons.

    Args:
        artd: article dict; ``'name'`` and ``'txt'`` are read.
        wikititles: list of titles searched with ``bisect`` (so presumably
            sorted -- verify against the caller).
        range: how many titles to take on each side of the insertion point
            (the name shadows the builtin but is part of the interface).
        BING: whether to run the Bing queries.
        WIKIBOT: whether to call ``wiki.queryInterface``.
        params: object providing ``getNumWordsQ1()`` / ``getNumWordsQ2()``,
            the number of leading article words added to the queries.

    Returns:
        A list (in arbitrary order) of candidate titles with spaces
        replaced by underscores, excluding titles that start with two or
        more digits.
    """
    # "Last, First" -> "First Last": put last name last.
    title = ' '.join(artd['name'].split(',')[::-1]).strip()
    print(title)

    index = bisect.bisect_left(wikititles, title)
    # Clamp the lower bound: a negative start would make the slice wrap
    # around to the end of the list and (usually) select nothing.
    lo = max(0, index - range)
    res = set()
    for wt in wikititles[lo:index + range]:
        # 'isambiguation' matches both "Disambiguation" and "disambiguation".
        if 'isambiguation' not in wt:
            res.add(wiki.reformat(wt))

    blim1 = params.getNumWordsQ1()
    blim2 = params.getNumWordsQ2()
    if BING:
        words = tools.splitToWords(artd['txt'])
        # Title with anything from the first " (" onwards stripped.
        base = 'site%3Awikipedia.org%20' + re.sub(r' \(.*', '', title)
        # Title plus the first blim1 words, then again with blim2 words.
        for limit in (blim1, blim2):
            res = addBingLinks(base + '%20' + '%20'.join(words[:limit]), res)
        # Finally the title on its own.
        res = addBingLinks(base, res)

    if WIKIBOT:
        res |= set(wiki.queryInterface(title))
        res |= set(wiki.queryInterface('_'.join(
            [title] +
            tools.splitToWords(artd['txt'])[:blim1])))

    # wiki suggestions
    res = res | set(wiki.getSearchSuggestions(title, limit=5))

    # wiki search
    res = res | set(wiki.search(title))

    # Deal with missing and disambiguation pages.  Iterate over a snapshot
    # because the set is modified inside the loop; titles added here are
    # not themselves re-checked.
    artd['missing'] = set()
    artd['disambig'] = set()
    for wt in list(res):
        if 'Talk:' in wt:
            res.add(wt.split('Talk:')[1])
            continue
        # Any other title containing ':' (which includes 'Category:') and
        # list pages are left in the result untouched.
        if ':' in wt or 'List_of' in wt:
            continue
        if wiki.isMissing(wt, rfmt=lambda x: x):
            # Snapshot entries are unique, so each missing title is
            # handled exactly once.
            res.remove(wt)
            artd['missing'].add(wt)
            res |= set(wiki.getSearchSuggestions(wt))
            print('miss ' + wt)
            continue

        dtitles = removeDisambigCandidates(wt)
        if dtitles is not None:
            artd['disambig'].add(wt)
            res.remove(wt)
            res |= dtitles

    normalized = set(wt.replace(' ', '_') for wt in res)

    # Drop titles that start with two or more digits.
    return [wt for wt in normalized if not re.match(r'\d\d\d*', wt)]