Example #1
def cate_smooth(twt_lst, ratio, sel, lmd=None):
    """Smooth the dataset by place category.

    `sel` decides whether a candidate tweet may be attached to a place;
    `lmd` is unused (callers in these examples pass only three arguments).
    """
    rst_lst = dataset.Dataset()
    pid_lst = twt_lst.distinct('place_id')
    twt_dist = twt_lst.groupfunc('place_id', len)
    tid_set = set(twt_lst.distinct('id'))  # ids of tweets already in the dataset
    pid_set = set(pid_lst)

    for pid in pid_lst:
        plc = dataset.loadrows(GEOTWEET, ('id', 'lat', 'lng', 'super_category'), \
            ('id = \'{0}\''.format(pid),), 'place')
        plc_type = plc[0]['super_category']
        tmp_lst = list()
        cand = dataset.type_random(plc_type)

        for twt in cand:
            if twt['id'] not in tid_set and twt['place_id'] not in pid_set:
                if sel(twt, plc):
                    twt['place_id'] = pid
                    tid_set.add(twt['id'])
                    pid_set.add(twt['place_id'])
                    tmp_lst.append(twt)
                if len(tmp_lst) >= ratio * twt_dist[pid]:
                    break
        rst_lst.extend(tmp_lst)

    rst_lst.extend(twt_lst)

    return rst_lst
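The `sel` callback receives the candidate tweet and the place rows loaded above and returns whether the tweet may be attached. A minimal sketch of such a predicate, assuming candidate tweets carry `lat`/`lng` fields and using an illustrative 1 km haversine cutoff (both are assumptions, not part of the original API):

import math

def near_place(twt, plc, cutoff_km=1.0):
    """Hypothetical `sel` predicate: accept a candidate tweet only if
    it lies within cutoff_km of the place loaded in cate_smooth."""
    lat1, lng1 = math.radians(twt['lat']), math.radians(twt['lng'])
    lat2, lng2 = math.radians(plc[0]['lat']), math.radians(plc[0]['lng'])
    # haversine great-circle distance in kilometres
    a = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lng2 - lng1) / 2) ** 2
    return 2 * 6371.0 * math.asin(math.sqrt(a)) <= cutoff_km

# e.g. cate_smooth(twt_lst, 1, near_place)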
Example #2
def cnt_map(region, table='sample', draw=True):
    """Draw a hexbin density map of tweets in a region"""
    twt_lst = dataset.loadrows(GEOTWEET, ('lat', 'lng'),
            ('MBRContains({0}, geo)'.format(dataset.geo_rect(*region)),), table)
    lat = list()
    lng = list()
    for twt in twt_lst:
        lat.append(twt['lat'])
        lng.append(twt['lng'])
    if draw:
        x = np.array(lng)
        y = np.array(lat)
        xmin = x.min()
        xmax = x.max()
        ymin = y.min()
        ymax = y.max()

        plt.hexbin(x, y, gridsize=200, cmap=cm.jet)
        plt.axis([xmin, xmax, ymin, ymax])
        plt.title("Hexagon binning")
        cb = plt.colorbar()
        cb.set_label('counts')

        plt.show()
    return lat, lng
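A minimal usage sketch; the bounding-box values and the coordinate order expected by `dataset.geo_rect` are assumptions:

# hypothetical bounding box (the argument order of geo_rect is assumed)
manhattan = (40.70, -74.02, 40.80, -73.93)
lat, lng = cnt_map(manhattan, table='sample', draw=True)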
Example #4
File: cate.py  Project: spacelis/anatool
def region2arff(dst, region):
    """Generate data in the region in arff format"""
    twt_lst = dataset.loadrows(GEOTWEET, ("place_id", "text"), ("MBRContains({0}, geo)".format(geo_rect(*region)),))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"]))) for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # drop keys that appear 3 times or fewer
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = twt_lst[i]["place_id"]
    vec_lst.gen_arff(dst, keylist)
Example #5
def region2arff(dst, region):
    """Generate data in the region in arff format"""
    twt_lst = dataset.loadrows(
        GEOTWEET, ('place_id', 'text'),
        ('MBRContains({0}, geo)'.format(geo_rect(*region)), ))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt['text']))) \
            for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # drop keys that appear 3 times or fewer
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = twt_lst[i]['place_id']
    vec_lst.gen_arff(dst, keylist)
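`line2tf` is used throughout these examples to turn a tweet's text into a term-frequency vector keyed by token. A plausible minimal reconstruction, assuming plain whitespace tokenisation (the real tokeniser may differ):

from collections import Counter

def line2tf_sketch(line):
    """Hypothetical stand-in for line2tf: map text to a dict of
    token -> raw term frequency, assuming whitespace tokenisation."""
    return dict(Counter(line.lower().split()))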
Example #6
def time_plot(place_id):
    """Plot a histogram of tweet times over the week
    (weekday plus hour of day as a fraction).
    """
    tims = list()
    rows = dataset.loadrows(GEOTWEET, ('created_at',), ('place_id=\'{0}\''.format(place_id),), 'sample')
    for line in rows:
        if line['created_at'] is None:
            continue
        tim = time.strptime(str(line['created_at']), '%Y-%m-%d %H:%M:%S')
        tims.append(tim.tm_wday + tim.tm_hour/24.0)
    x = np.array(tims)
    plt.hist(x, 42)
    plt.title('Place {0}'.format(place_id))
    plt.show()
Example #7
def time_place_plot(user_id):
    """Plot when in the week a user tweets, against an index of the
    places the tweets were posted from.
    """
    tims = list()
    plcs = list()
    idm = id2int()
    rows = dataset.loadrows(GEOTWEET, ('created_at', 'place_id'), ('user_id={0}'.format(user_id),), 'sample')
    for line in rows:
        if line['created_at'] is None:
            continue
        tim = time.strptime(str(line['created_at']), '%Y-%m-%d %H:%M:%S')
        plc = line['place_id']
        tims.append(tim.tm_wday + tim.tm_hour/24.0)
        plcs.append(idm.map(plc))
    x = np.array(tims)
    y = np.array(plcs)
    plt.plot(x, y, 'o')
    plt.title('User {0}'.format(user_id))
    plt.show()
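`id2int` only appears here through its `map` method, which must turn a place id into a stable integer for the y-axis. A minimal sketch consistent with that usage (the real implementation may differ):

class id2int(object):
    """Hypothetical reconstruction: assign consecutive integers to
    keys in the order they are first seen."""
    def __init__(self):
        self._ids = {}

    def map(self, key):
        if key not in self._ids:
            self._ids[key] = len(self._ids)
        return self._ids[key]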
Example #8
def top_poi_100_crs(dst, city, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), \
            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    #output the places as json objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    #output the tweets as json objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])

    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = places[twt_lst[i]['place_id']][col]
        vec_lst[i]['__NO__'] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
Example #9
File: cate.py  Project: spacelis/anatool
def cate_smoothed_100(dst, city):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"]))) for twt in twt_lst])

    bgdist = vec_lst.bgdist()

    # drop keys that appear 3 times or fewer
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append("__CLASS__")

    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(dataset.place_name(twt_lst[i]["place_id"], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
Example #10
File: cate.py  Project: spacelis/anatool
def top_poi_100_crs(dst, city, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place

    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        i = 0
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])

    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = places[twt_lst[i]["place_id"]][col]
        vec_lst[i]["__NO__"] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
Example #11
def cate_smoothed_100(dst, city):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), \
            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset(
        [line2tf(comma_filter(fourq_filter(twt['text']))) for twt in twt_lst])

    bgdist = vec_lst.bgdist()

    # drop keys that appear 3 times or fewer
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append('__CLASS__')

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
            dataset.place_name(twt_lst[i]['place_id'], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
Example #12
File: cate.py  Project: spacelis/anatool
def all_poi_100(dst, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), None, "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place

    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        i = 0
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append("__CLASS__")

    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key] / math.log(float(idf[key]) + 1)

    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(places[twt_lst[i]["place_id"]][col])
        vec_lst[i]["__NO__"] = i

    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)
Example #13
def all_poi_100(dst, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), \
            None, 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    #output the places as json objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    #output the tweets as json objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append('__CLASS__')

    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key] / math.log(float(idf[key]) + 1)

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
            places[twt_lst[i]['place_id']][col])
        vec_lst[i]['__NO__'] = i

    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)
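The commented-out "add idf divisor" block above sketches a tf-idf style reweighting. A worked version of that idea, assuming `vec_lst.idf()` returns a dict mapping each key to its document frequency, as the comments imply:

import math

def apply_idf(vec_lst, idf):
    """Divide each raw term frequency by log(df + 1), following the
    commented-out sketch above (idf: assumed key -> document
    frequency dict; vectors are assumed dict-like)."""
    for vec in vec_lst:
        for key in list(vec):
            vec[key] = vec[key] / math.log(float(idf[key]) + 1)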
Example #14
def cate_smooth(twt_lst, ratio, sel, lmd=None):
    """Smooth the dataset by place category"""
    rst_lst = dataset.Dataset()
    pid_lst = twt_lst.distinct('place_id')
    twt_dist = twt_lst.groupfunc('place_id', len)
    tid_set = set(twt_lst.distinct('id'))
    pid_set = set(pid_lst)

    for pid in pid_lst:
        plc = dataset.loadrows(GEOTWEET, ('id', 'lat', 'lng', 'super_category'),
            ("id = '{0}'".format(pid),), 'place')
        plc_type = plc[0]['super_category']
        tmp_lst = list()
        cand = dataset.type_random(plc_type)

        for twt in cand:
            if twt['id'] not in tid_set and twt['place_id'] not in pid_set:
                if sel(twt, plc):
                    twt['place_id'] = pid
                    tid_set.add(twt['id'])
                    pid_set.add(twt['place_id'])
                    tmp_lst.append(twt)
                if len(tmp_lst) >= ratio * twt_dist[pid]:
                    break
        rst_lst.extend(tmp_lst)

    rst_lst.extend(twt_lst)

    return rst_lst

if __name__ == '__main__':
    twt_lst = cate_smooth(
        dataset.loadrows(GEOTWEET, ('text', 'place_id'),
                         ('place_id = \'0002ac59702e20cf\'',)),
        10, lambda x, y: True)
    print '----------------------'
    for twt in twt_lst:
        print twt

Example #15
def cate_smooth(twt_lst, ratio, sel, lmd=None):
    """Smooth the dataset by place category"""
    rst_lst = dataset.Dataset()
    pid_lst = twt_lst.distinct('place_id')
    twt_dist = twt_lst.groupfunc('place_id', len)
    tid_set = set(twt_lst.distinct('id'))
    pid_set = set(pid_lst)

    for pid in pid_lst:
        plc = dataset.loadrows(GEOTWEET, ('id', 'lat', 'lng', 'super_category'),
            ("id = '{0}'".format(pid),), 'place')
        plc_type = plc[0]['super_category']
        tmp_lst = list()
        cand = dataset.type_random(plc_type)

        for twt in cand:
            if twt['id'] not in tid_set and twt['place_id'] not in pid_set:
                if sel(twt, plc):
                    twt['place_id'] = pid
                    tid_set.add(twt['id'])
                    pid_set.add(twt['place_id'])
                    tmp_lst.append(twt)
                if len(tmp_lst) >= ratio * twt_dist[pid]:
                    break
        rst_lst.extend(tmp_lst)

    rst_lst.extend(twt_lst)

    return rst_lst


if __name__ == '__main__':
    twt_lst = cate_smooth(
        dataset.loadrows(GEOTWEET, ('text', 'place_id'),
                         ('place_id = \'0002ac59702e20cf\'', )), 10,
        lambda x, y: True)
    print '----------------------'
    for twt in twt_lst:
        print twt
Example #16
def cnt_poi(city, table='sample'):
    """ Draw the tweet distribution over POIs."""
    twt_lst = dataset.loadrows(GEOTWEET, ('place_id', 'count(id) as cnt'),
                               "superior_id='%s'" % (city), table, 'group by place_id')
Example #17
def top_poi100_map():
    """Render the top-100 POIs of Los Angeles on a map."""
    plc_lst = dataset.loadrows(GEOTWEET, ('lat', 'lng', 'name'),
            ('superior_name=\'los angeles\'',), 'sample_dist_100')
    for plc in plc_lst:
        plc['text'] = plc['name']
    geo_map('../la_100.html', plc_lst)