Exemplo n.º 1
0
def onesetup(places, numtwts, numtest, balance):
    """ Plot how well single-tweet queries are ranked to their place when the
        places are modelled by tweets, by web pages and by both jointly.

        For every place a random sample of max(numtwts) + numtest tweets is
        requested; the first max(numtwts) texts form the training pool and the
        remaining rows become test queries. For each size in numtwts the tweet
        models are rebuilt from the first `size` training texts and the test
        tweets are ranked with joint_ranking() and kl_ranking(). A web-only
        ranking and a random baseline are plotted as well.

        @arg places the list of place_id values to compare
        @arg numtwts a non-empty sequence of training sizes (tweets per place)
        @arg numtest the number of test tweets requested per place
        @arg balance passed unchanged to joint_ranking()
        @return nothing, the figure is displayed with plt.show()
    """
    lsts = linestyles()
    trainsize = max(numtwts)

    # prepare for data
    twtmodel = dict()
    webmodel = dict()
    twttest = Dataset()
    for pid in places:
        twtp = loadrows(GEOTWEET, ('place_id', 'text'),
                ('place_id=\'{0}\''.format(pid),), 'sample',
                'order by rand() limit {0}'.format(trainsize + numtest))
        webmodel[pid] = loadrows(GEOTWEET, ('place_id', 'web'),
                ('place_id=\'{0}\''.format(pid),), 'web',
                'order by rand() limit 30')['web']
        twtmodel[pid] = twtp['text'][:trainsize]
        # The training slice covers rows 0 .. trainsize - 1, so the test rows
        # start at trainsize (starting at trainsize + 1 dropped one row).
        for idx in range(trainsize, twtp.size()):
            twttest.append(twtp.item(idx))

    # The web models do not depend on the training size: build them once.
    lmweb = dict()
    for pid in twtmodel:
        lmweb[pid] = lmfromtext(webmodel[pid])

    # ranking by twt and twt+web
    for numtwt in numtwts:
        lmtwt = dict()
        for pid in twtmodel:
            lmtwt[pid] = lmfromtext(twtmodel[pid][:numtwt])
        jointranks = list()
        for item in twttest:
            jointranks.append(joint_ranking(lmfromtext([item['text'],]), lmtwt, lmweb, balance))
        twtranks = list()
        for item in twttest:
            twtranks.append(kl_ranking(lmtwt, lmfromtext([item['text'],])))
        gjoint = batcheval(twttest['place_id'], len(places), jointranks)
        gtwt = batcheval(twttest['place_id'], len(places), twtranks)
        # next(lsts) works on both Python 2 and Python 3 iterators
        plt.plot(gjoint['pos'], gjoint['rate'], marker='^',
                label='JOINT($t={0}$)'.format(numtwt), linestyle=next(lsts))
        plt.plot(gtwt['pos'], gtwt['rate'], marker='o',
                label='TWEET($t={0}$)'.format(numtwt), linestyle=next(lsts))

    # ranking by web only
    webranks = list()
    for item in twttest:
        webranks.append(kl_ranking(lmweb, lmfromtext([item['text'],])))
    gweb = batcheval(twttest['place_id'], len(places), webranks)
    plt.plot(gweb['pos'], gweb['rate'], label='WEB', linestyle='dotted')

    # NOTE(review): this baseline used to read from `lmeval`, a name that is
    # never assigned in this function. The positions of the web evaluation are
    # used instead, assuming batcheval() reports the same positions for every
    # ranking evaluated over len(places) -- TODO confirm.
    positions = gweb['pos']
    lastpos = max(positions)
    plt.plot(positions, [float(r) / lastpos for r in positions],
             ls='-.', marker='s',
             label='Random Baseline')
    plt.xlabel('First $n$ Places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
Exemplo n.º 2
0
def kldiff(places):
    """ compare the difference of kl-divergence between tweets and web pages
        for each place in places

        For every place 100 random tweets are requested: the first 50 texts
        build the reference model and the remaining texts build the tweet
        model. The web model is built from the 'web' column of the web rows.
        One line '<place name> & <tweet kld> & <web kld>' is printed per place.

        @arg places the list of place_id values to compare
        @return nothing, the results are printed
    """
    numref = 50     # number of tweets used for the reference model
    diff = Dataset()
    for pid in places:
        twt = loadrows(GEOTWEET, ('place_id', 'text'),
                ('place_id=\'{0}\''.format(pid),), 'sample',
                'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'),
                ('place_id=\'{0}\''.format(pid),), 'web',
                'limit 25')
        lmref = lmfromtext(twt['text'][:numref])
        # Continue right after the reference slice; the former [51:] silently
        # dropped the tweet at index 50.
        lmtwt = lmfromtext(twt['text'][numref:])
        lmweb = lmfromtext(web['web'])
        diff.append({'pid': pid, 'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref)})
    for item in diff:
        # a single parenthesised argument prints the same on Python 2 and 3
        print('{0} & {1} & {2}'.format(place_name(item['pid']), item['twtkld'], item['webkld']))
Exemplo n.º 3
0
def kldiff(places):
    """ compare the difference of kl-divergence between tweets and web pages
        for each place in places

        The query for each place asks for 100 random tweets. The first 50
        texts are turned into the reference language model, all remaining
        texts into the tweet language model, and the 'web' column of the web
        rows into the web language model. Both divergences against the
        reference model are collected and finally printed, one line
        '<place name> & <tweet kld> & <web kld>' per place.

        @arg places the list of place_id values to compare
        @return nothing, the results are printed
    """
    split = 50  # boundary between reference tweets and compared tweets
    diff = Dataset()
    for pid in places:
        twt = loadrows(GEOTWEET, ('place_id', 'text'),
                       ('place_id=\'{0}\''.format(pid), ), 'sample',
                       'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'),
                       ('place_id=\'{0}\''.format(pid), ), 'web', 'limit 25')
        texts = twt['text']
        lmref = lmfromtext(texts[:split])
        # Use every tweet after the reference slice: slicing from 51 skipped
        # the tweet at index 50.
        lmtwt = lmfromtext(texts[split:])
        lmweb = lmfromtext(web['web'])
        diff.append({
            'pid': pid,
            'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref)
        })
    for item in diff:
        # one parenthesised argument keeps the output identical on Python 2
        # while also being valid Python 3
        print('{0} & {1} & {2}'.format(place_name(item['pid']), item['twtkld'],
                                       item['webkld']))
Exemplo n.º 4
0
def onesetup(places, numtwts, numtest, balance):
    """ Evaluate and plot place ranking for queries made of a single tweet.

        The tweets of each place are split into a training pool of
        max(numtwts) texts and a set of test rows (numtest are requested per
        place). For each training size in numtwts the test tweets are ranked
        by the tweet models alone (kl_ranking) and by tweet and web models
        together (joint_ranking). A web-only ranking and a random baseline
        are added to the same figure.

        @arg places the list of place_id values to compare
        @arg numtwts a non-empty sequence of training sizes (tweets per place)
        @arg numtest the number of test tweets requested per place
        @arg balance passed unchanged to joint_ranking()
        @return nothing, the figure is displayed with plt.show()
    """
    lsts = linestyles()
    maxtwt = max(numtwts)

    # prepare for data
    twtmodel = dict()
    webmodel = dict()
    twttest = Dataset()
    for pid in places:
        twtp = loadrows(
            GEOTWEET, ('place_id', 'text'), ('place_id=\'{0}\''.format(pid), ),
            'sample',
            'order by rand() limit {0}'.format(maxtwt + numtest))
        webmodel[pid] = loadrows(GEOTWEET, ('place_id', 'web'),
                                 ('place_id=\'{0}\''.format(pid), ), 'web',
                                 'order by rand() limit 30')['web']
        twtmodel[pid] = twtp['text'][:maxtwt]
        # Rows 0 .. maxtwt - 1 are training data, so testing starts at row
        # maxtwt; the previous start of maxtwt + 1 left one row unused.
        for idx in range(maxtwt, twtp.size()):
            twttest.append(twtp.item(idx))

    # The web language models are independent of the training size, so they
    # are built once instead of once per entry of numtwts.
    lmweb = dict()
    for pid in twtmodel:
        lmweb[pid] = lmfromtext(webmodel[pid])

    # ranking by twt and twt+web
    for numtwt in numtwts:
        lmtwt = dict()
        for pid in twtmodel:
            lmtwt[pid] = lmfromtext(twtmodel[pid][:numtwt])
        jointranks = list()
        for item in twttest:
            jointranks.append(
                joint_ranking(lmfromtext([
                    item['text'],
                ]), lmtwt, lmweb, balance))
        twtranks = list()
        for item in twttest:
            twtranks.append(kl_ranking(lmtwt, lmfromtext([
                item['text'],
            ])))
        gjoint = batcheval(twttest['place_id'], len(places), jointranks)
        gtwt = batcheval(twttest['place_id'], len(places), twtranks)
        # the builtin next() is used so the code runs on Python 2 and 3
        plt.plot(gjoint['pos'],
                 gjoint['rate'],
                 marker='^',
                 label='JOINT($t={0}$)'.format(numtwt),
                 linestyle=next(lsts))
        plt.plot(gtwt['pos'],
                 gtwt['rate'],
                 marker='o',
                 label='TWEET($t={0}$)'.format(numtwt),
                 linestyle=next(lsts))

    # ranking by web only
    webranks = list()
    for item in twttest:
        webranks.append(kl_ranking(lmweb, lmfromtext([
            item['text'],
        ])))
    gweb = batcheval(twttest['place_id'], len(places), webranks)
    plt.plot(gweb['pos'], gweb['rate'], label='WEB', linestyle='dotted')

    # NOTE(review): the baseline was plotted from `lmeval`, which is not
    # assigned anywhere in this function. The web evaluation positions are
    # used here on the assumption that batcheval() yields the same positions
    # for all rankings over len(places) -- TODO confirm.
    baseline_pos = gweb['pos']
    maxpos = max(baseline_pos)
    plt.plot(baseline_pos,
             [float(r) / maxpos for r in baseline_pos],
             ls='-.',
             marker='s',
             label='Random Baseline')
    plt.xlabel('First $n$ Places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()