예제 #1
0
def getCombinedScores(arttxt, title_txts, comp, occp, wt=True, vt=True):
    '''Rank candidate (name, text) pairs against an article text.

    For every candidate two partial scores are computed:
      * comp(candidate_words, article_words) on the lower-cased word lists
        (note the argument order: candidate first), weighted by 100;
      * the sum of the values returned by getKeywordVectorScore().

    wt -- include the weighted word score in the total
    vt -- include the keyword score in the total

    The module-level NAME is set to the candidate currently being scored
    before comp() is called.

    Returns a list of (name, score) tuples, highest score first.
    '''
    global NAME

    art_words = [w.lower() for w in tools.splitToWords(arttxt)]

    ranked = []
    for name, text in title_txts:
        NAME = name

        cand_words = [w.lower() for w in tools.splitToWords(text)]
        word_score = comp(cand_words, art_words)  # reversed argument order
        keyword_score = sum(
            getKeywordVectorScore(arttxt, name, text, occp).values())

        total = 0
        if wt:
            total += word_score * 100
        if vt:
            total += keyword_score

        ranked.append((name, total))

    return sorted(ranked, key=lambda pair: pair[1], reverse=True)
예제 #2
0
def getKeywordVectorScore(art_text, cand_title, cand_text, occp):
    '''Score how well a candidate page matches an article, per category.

    art_text   -- text of the article being matched
    cand_title -- candidate title, words separated by '_'
    cand_text  -- candidate page text; birth/death years are read from its
                  "Category: <year> ... births/deaths" entries
    occp       -- container of occupation words (only membership is used)

    Only the first 150 words of each text are inspected. Points awarded:
      100 -- one of the first three numeric article words equals (after
             fixYear) a candidate year
       80 -- one of the first two numeric article words is less than 10
             away from one of the first four candidate years; stored under
             the article's year, so it is bucketed wherever that string
             falls (normally 'other')
       80 -- one of the first 19 article words is a word of the title
       20 -- an occupation word seen in both texts

    Returns a dict with the keys 'years', 'titles', 'occp' and 'other'.
    '''
    score_dict = dict()
    title_words = cand_title.lower().split('_')
    years = (re.findall(r'Category: *(\d+).+births', cand_text) +
             re.findall(r'Category: *(\d+).+deaths', cand_text))

    for year in years:
        score_dict[year] = 0

    for word in tools.splitToWords(cand_text)[:150]:
        if word in occp:
            score_dict[word] = 0

    nyear = 0
    for nword, word in enumerate(tools.splitToWords(art_text)[:150], 1):
        if tools.isNumeric(word):
            year = fixYear(word)
            nyear += 1
            if year in score_dict and nyear < 4:
                score_dict[year] = 100
            elif nyear < 3:
                for wyear in years[:4]:
                    try:
                        close = abs(int(wyear) - int(year)) < 10
                    except (TypeError, ValueError, OverflowError):
                        # fixYear() gave something that is not an integer
                        # year; nothing to compare against.
                        continue
                    if close:
                        score_dict[year] = 80
        elif nword < 20 and word.lower() in title_words:
            score_dict[word.lower()] = 80
        elif word in occp and word in score_dict:
            score_dict[word] = 20

    score = {'years': 0, 'titles': 0, 'occp': 0, 'other': 0}
    for k, points in score_dict.items():
        if k in years:
            score['years'] += points
        elif k in title_words:
            score['titles'] += points
        elif k in occp:
            score['occp'] += points
        else:
            score['other'] += points

    return score
예제 #3
0
def getProximityScores(arttxt, title_txts, comp):
    '''Rank candidates by a single measure computed with comp.

    arttxt     -- article text
    title_txts -- sequence of (name, text) candidate pairs
    comp       -- callable taking (article_words, candidate_words), both
                  lower-cased word lists, and returning a sortable score

    The module-level NAME is set to the name of the first candidate.

    Returns a list of (name, score) tuples, highest score first; an empty
    list when there are no candidates.
    '''
    global NAME

    # Nothing to rank; indexing title_txts[0] below would raise IndexError.
    if not title_txts:
        return []

    # Real lists (not lazy map objects) so the article words can be reused
    # for every candidate.
    wordsa = [w.lower() for w in tools.splitToWords(arttxt)]

    NAME = title_txts[0][0]

    res = []
    for name, text in title_txts:
        wordsb = [w.lower() for w in tools.splitToWords(text)]
        res.append((name, comp(wordsa, wordsb)))
    res.sort(key=lambda x: x[1], reverse=True)
    return res
예제 #4
0
def getWordDict(arts):
    '''Build an inverted index from article text words to article ids.

    arts -- iterable of dicts with at least the keys "txt" and "id"

    Returns a dict mapping every word of the lower-cased "txt" to the set
    of "id" values of the articles containing it.
    '''
    index = {}
    for art in arts:
        text_words = tools.splitToWords(art["txt"].lower())
        for word in text_words:
            index.setdefault(word, set()).add(art["id"])
    return index
예제 #5
0
def getOffsetWordDict(arts):
    '''Build an inverted index from article name words to article ids.

    arts -- iterable of dicts with at least the keys "name" and "id"

    Words found in BLACKLIST are left out. Returns a dict mapping every
    remaining word of the lower-cased "name" to a list of "id" values (an
    id appears once per occurrence of the word in that name).
    '''
    index = {}
    for art in arts:
        for word in tools.splitToWords(art["name"].lower()):
            if word not in BLACKLIST:
                index.setdefault(word, []).append(art["id"])
    return index
예제 #6
0
def getOffsetDirectMentions(arts, wd=None, wlen=3):
    '''Find articles whose names are mentioned in other articles' texts.

    arts -- list of dicts with the keys "id", "name" and "txt"
    wd   -- mapping word -> article ids; built with getOffsetWordDict(arts)
            when omitted
    wlen -- size of the sliding word window moved over each text

    For every article a window of wlen consecutive words slides over the
    lower-cased text. An article m counts as mentioned when the number of
    distinct window words indexed under m reaches the number of
    non-blacklisted words in m's name, or equals wlen.

    Side effects: sets a["direct_refs"] on every article to a set of
    (name, id) tuples and prints a progress line per article.
    Returns None.
    '''
    ad = dict((a["id"], a) for a in arts)

    if wd is None:
        wd = getOffsetWordDict(arts)

    # Non-blacklisted word count of each referenced name; it does not depend
    # on the window position, so compute it once per article id.
    # NOTE(review): unlike getOffsetWordDict the name is not lower-cased
    # before the BLACKLIST test here - confirm that is intended.
    namelens = {}

    for artcount, a in enumerate(arts):
        words = tools.splitToWords(a["txt"].lower())
        window = words[:wlen]

        a["direct_refs"] = set()

        # The trailing "" is a sentinel so the last real window is examined
        # before the loop ends.
        for w in words[wlen:] + [""]:
            matchd = dict()
            for word in set(window):
                if word in wd:
                    for aid in set(wd[word]):
                        matchd[aid] = matchd.get(aid, 0) + 1

            for m, hits in matchd.items():
                if m not in namelens:
                    namelens[m] = len(
                        [x for x in tools.splitToWords(ad[m]["name"])
                         if x not in BLACKLIST])
                if namelens[m] <= hits or hits == wlen:
                    a["direct_refs"].add((ad[m]["name"], ad[m]["id"]))

            if len(window) > 0:
                window.pop(0)
            window.append(w)

        # Single pre-formatted argument: same output under Python 2 and 3.
        print("at %s from %s" % (artcount, len(arts)))
예제 #7
0
def getWikiCandidates(wdict, title):
    '''Return the ids indexed under any word of title.

    wdict -- mapping word -> iterable of ids
    title -- title string; it is lower-cased before being split into words

    The result is the union over all title words found in wdict. (An
    earlier variant intersected the per-word id sets after the first hit;
    that path had been disabled and could never run, so it is dropped.)

    Prints a notice when nothing matches. Returns a list of ids.
    '''
    candids = set()
    for w in tools.splitToWords(title.lower()):
        if w in wdict:
            candids |= set(wdict[w])

    if not candids:
        # Single pre-formatted argument: same output under Python 2 and 3.
        print("no match for %s" % title)

    return list(candids)
예제 #8
0
def getDirectMentions(arts, wd):
    '''Attach to each article the articles sharing all of its name words.

    arts -- iterable of dicts with the keys "id" and "name"
    wd   -- mapping word -> collection of article ids (sets or lists)

    For every article the id collections of those name words that occur in
    wd are intersected; name words missing from wd are ignored. The result
    is stored in a["direct_refs"] as a list of (name, id) tuples, empty when
    no name word is known. Returns None.
    '''
    ad = dict((a["id"], a) for a in arts)

    for a in arts:
        words = tools.splitToWords(a["name"].lower())

        # Wrap every entry in set(): wd values may be lists, on which the
        # set intersection operator would raise TypeError.
        known = [set(wd[w]) for w in words if w in wd]
        if known:
            match = set.intersection(*known)
        else:
            match = set()

        a["direct_refs"] = [(ad[m]["name"], ad[m]["id"]) for m in match]
예제 #9
0
def getNeighbors(artd, wikititles, range=3, BING=True, WIKIBOT=True, params=PARAMS):
    '''Collect candidate Wikipedia titles for the article artd.

    artd       -- article dict with the keys 'name' and 'txt'
    wikititles -- list of titles searched with bisect (presumably sorted;
                  verify against the caller)
    range      -- how many titles on each side of the insertion point are
                  taken (shadows the builtin; kept because it is part of
                  the signature)
    BING       -- also collect links via addBingLinks()
    WIKIBOT    -- also collect results of wiki.queryInterface()
    params     -- object providing getNumWordsQ1() / getNumWordsQ2(), the
                  number of leading article words added to the queries

    The name is first reordered around its commas ("Last, First" becomes
    "First Last"). Candidates are pooled from the neighbouring titles, the
    optional Bing and wiki-bot lookups, wiki.getSearchSuggestions() and
    wiki.search(). Pages reported missing are replaced by search
    suggestions; pages for which removeDisambigCandidates() returns a
    result are replaced by that result.

    Side effects: sets artd['missing'] and artd['disambig'] and prints
    progress output.

    Returns a list of titles with spaces turned into underscores, without
    the titles that start with two digits.
    '''
    # put last name last
    title = ' '.join(artd['name'].split(',')[::-1]).strip()

    print(title)

    index = bisect.bisect_left(wikititles, title)
    # Clamp at 0: a negative start would make the slice count from the end
    # of the list and lose the real neighbours near the beginning.
    start = max(0, index - range)
    res = set()

    for wt in wikititles[start:index + range]:
        if 'isambiguation' not in wt:
            res.add(wiki.reformat(wt))

    # bing it
    blim1 = params.getNumWordsQ1()
    blim2 = params.getNumWordsQ2()
    if BING:
        # title without a trailing " (...)" qualifier
        base_q = 'site%3Awikipedia.org%20' + re.sub(r' \(.*', '', title)

        firstwords = '%20'.join(tools.splitToWords(artd['txt'])[:blim1])
        res = addBingLinks(base_q + '%20' + firstwords, res)

        # second time with more words
        firstwords = '%20'.join(tools.splitToWords(artd['txt'])[:blim2])
        res = addBingLinks(base_q + '%20' + firstwords, res)

        # and once with the title alone
        res = addBingLinks(base_q, res)

    if WIKIBOT:
        res |= set(wiki.queryInterface(title))
        res |= set(wiki.queryInterface('_'.join(
            [title] +
            tools.splitToWords(artd['txt'])[:blim1])))

    # wiki suggestions
    res = res | set(wiki.getSearchSuggestions(title, limit=5))

    # wiki search
    res = res | set(wiki.search(title))

    # deal with missing and disambiguation pages; iterate over a snapshot
    # because res is modified inside the loop
    artd['missing'] = set()
    artd['disambig'] = set()
    for wt in list(res):
        if 'Talk:' in wt:
            # keep the page the talk page belongs to
            res.add(wt.split('Talk:')[1])
            continue
        if ':' in wt or 'List_of' in wt or 'Category:' in wt:
            continue
        if wiki.isMissing(wt, rfmt=lambda x: x):
            res.remove(wt)
            if wt not in artd['missing']:
                artd['missing'].add(wt)
                res |= set(wiki.getSearchSuggestions(wt))
                print('miss %s' % wt)
                continue

        dtitles = removeDisambigCandidates(wt)
        if dtitles is not None:
            artd['disambig'].add(wt)
            res.remove(wt)
            res |= dtitles

    res = set(x.replace(' ', '_') for x in res)

    # drop titles starting with at least two digits
    return [wt for wt in res if not re.match(r'\d\d\d*', wt)]
예제 #10
0
def getKeywordScore(art_text, cand_title, cand_text, occp):
    '''Score how well a candidate page matches an article.

    art_text   -- text of the article being matched
    cand_title -- candidate title, words separated by '_'
    cand_text  -- candidate page text; birth/death years are read from its
                  "Category: <year> ... births/deaths" entries
    occp       -- container of occupation words (only membership is used)

    Only the first 150 words of each text are inspected. Points awarded:
      100 -- one of the first three numeric article words equals (after
             fixYear) a candidate year
       80 -- one of the first two numeric article words is less than 10
             away from one of the first four candidate years
       80 -- one of the first 19 article words is a word of the title
       20 -- an occupation word seen in both texts

    Returns the sum of all awarded points.
    '''
    score_dict = dict()
    title_words = cand_title.lower().split('_')
    years = (re.findall(r'Category: *(\d+).+births', cand_text) +
             re.findall(r'Category: *(\d+).+deaths', cand_text))

    for year in years:
        score_dict[year] = 0

    for word in tools.splitToWords(cand_text)[:150]:
        if word in occp:
            score_dict[word] = 0

    nyear = 0
    for nword, word in enumerate(tools.splitToWords(art_text)[:150], 1):
        if tools.isNumeric(word):
            year = fixYear(word)
            nyear += 1
            if year in score_dict and nyear < 4:
                score_dict[year] = 100
            elif nyear < 3:
                for wyear in years[:4]:
                    try:
                        close = abs(int(wyear) - int(year)) < 10
                    except (TypeError, ValueError, OverflowError):
                        # fixYear() gave something that is not an integer
                        # year; nothing to compare against.
                        continue
                    if close:
                        score_dict[year] = 80
        elif nword < 20 and word.lower() in title_words:
            score_dict[word.lower()] = 80
        elif word in occp and word in score_dict:
            score_dict[word] = 20

    return sum(score_dict.values())