def get_persons(path, out):
    """Filter pickled articles down to those that look like persons.

    Loads a pickled iterable of article dicts from ``path``. For each
    article, the ``'name'`` value (with any trailing " (...)" qualifier
    stripped) is sent to ``bing.query`` restricted to wikipedia.org, and the
    returned links pointing into ``en.wikipedia.org/wiki/`` are turned into
    candidate titles. The article is kept when ``wiki.isPerson`` is truthy
    for at least one candidate.

    Args:
        path: Path of the pickle file holding the articles.
        out: Path of a text file that receives the name of every kept
            article, one per line. The file is truncated on open.

    Returns:
        List of the article dicts that were kept, in input order.

    Raises:
        Any exception raised while evaluating candidates is reported on
        stdout as ``no match <name>`` and then re-raised.
    """
    # SECURITY: unpickling executes arbitrary code; only use trusted files.
    # Binary mode is what pickle expects for every protocol.
    with open(path, 'rb') as f:
        arts = pickle.load(f)

    res = []
    # ``with`` guarantees the output file is flushed and closed even when a
    # lookup below raises and the exception propagates to the caller.
    with open(out, 'w') as w:
        for art in arts:
            t = art['name']
            q = 'site:wikipedia.org%20' + re.sub(r' \(.*', '', t)
            links = [x for x in bing.query(q, 3)
                     if 'en.wikipedia.org/wiki/' in x]
            try:
                candids = [wiki.reformat(x.split('/wiki/')[1]) for x in links]
                # Stops at the first candidate recognised as a person.
                person = any(wiki.isPerson(c) for c in candids)
                if person:
                    print(t)
                    res.append(art)
                    w.write(t)
                    w.write('\n')
                else:
                    # Tab-indented name marks an article that was rejected.
                    print('\t%s' % (t,))
            except Exception:
                print('no match %s' % (t,))
                raise
    return res
def getNeighbors(artd, wikititles, range=3, BING=True, WIKIBOT=True, params=PARAMS):
    """Collect candidate Wikipedia titles for the article ``artd``.

    Candidates are gathered from several sources: the entries of
    ``wikititles`` around the position where the article title would be
    inserted, Bing queries (when ``BING``), ``wiki.queryInterface`` (when
    ``WIKIBOT``), ``wiki.getSearchSuggestions`` and ``wiki.search``.
    Candidates reported missing by ``wiki.isMissing`` are replaced by search
    suggestions for them, and candidates for which
    ``removeDisambigCandidates`` returns a value are replaced by that value.

    Args:
        artd: Article dict; ``'name'`` and ``'txt'`` are read. The keys
            ``'missing'`` and ``'disambig'`` are overwritten with fresh sets
            of the titles found missing / treated as disambiguation pages.
        wikititles: Sequence of Wikipedia titles searched with ``bisect``
            (presumably sorted -- verify against the caller).
        range: Number of entries taken on each side of the insertion point.
            (The name shadows the builtin but is part of the public
            signature.)
        BING: Whether to add results of the Bing queries.
        WIKIBOT: Whether to add results of ``wiki.queryInterface``.
        params: Object providing ``getNumWordsQ1()`` and ``getNumWordsQ2()``,
            the number of leading article words used in the queries.

    Returns:
        List of candidate titles with spaces replaced by underscores;
        titles that start with two or more digits are dropped.
    """
    # Put the last name last: "Last, First" -> "First Last".
    title = ' '.join(artd['name'].split(',')[::-1]).strip()
    print(title)

    index = bisect.bisect_left(wikititles, title)
    # Clamp the window start at 0: a negative start would be interpreted by
    # the slice as an offset from the END of the list, producing an empty or
    # unrelated window for titles that sort near the front.
    start = max(0, index - range)
    res = set()
    for wt in wikititles[start:index + range]:
        # Substring matches both "Disambiguation" and "disambiguation".
        if 'isambiguation' not in wt:
            res.add(wiki.reformat(wt))

    blim1 = params.getNumWordsQ1()
    blim2 = params.getNumWordsQ2()

    if BING:
        words = tools.splitToWords(artd['txt'])
        base_q = 'site%3Awikipedia.org%20' + re.sub(r' \(.*', '', title)
        # Title plus the first blim1 words of the text.
        res = addBingLinks(base_q + '%20' + '%20'.join(words[:blim1]), res)
        # Second time with the first blim2 words.
        res = addBingLinks(base_q + '%20' + '%20'.join(words[:blim2]), res)
        # Finally the bare title.
        res = addBingLinks(base_q, res)

    if WIKIBOT:
        res |= set(wiki.queryInterface(title))
        res |= set(wiki.queryInterface(
            '_'.join([title] + tools.splitToWords(artd['txt'])[:blim1])))

    # NOTE(review): the two lookups below run regardless of WIKIBOT; confirm
    # that this is the intended nesting.
    # Wiki suggestions.
    res |= set(wiki.getSearchSuggestions(title, limit=5))
    # Wiki search.
    res |= set(wiki.search(title))

    # Deal with missing and disambiguation pages. Iterate over a snapshot
    # because ``res`` is modified inside the loop; titles added during the
    # loop are therefore not examined themselves.
    artd['missing'] = set()
    artd['disambig'] = set()
    for wt in list(res):
        if 'Talk:' in wt:
            # Add the text following "Talk:"; the talk title itself stays.
            res.add(wt.split('Talk:')[1])
            continue
        if ':' in wt or 'List_of' in wt or 'Category:' in wt:
            continue
        if wiki.isMissing(wt, rfmt=lambda x: x):
            res.remove(wt)
            if wt not in artd['missing']:
                artd['missing'].add(wt)
                res |= set(wiki.getSearchSuggestions(wt))
                print('miss %s' % (wt,))
            continue
        dtitles = removeDisambigCandidates(wt)
        if dtitles is not None:
            artd['disambig'].add(wt)
            res.remove(wt)
            res |= dtitles

    res = set(re.sub(' ', '_', x) for x in res)
    # Drop titles that begin with two or more digits.
    return [wt for wt in res if not re.match(r'\d\d\d*', wt)]