def cate_smooth(twt_lst, ratio, sel, lmd=None):
    """Smooth the dataset by place category.

    For each place in twt_lst, borrow tweets from other places of the same
    super_category (via dataset.type_random) until roughly ratio times the
    place's original tweet count has passed the selector sel(twt, plc).
    lmd is currently unused.
    """
    rst_lst = dataset.Dataset()
    pid_lst = twt_lst.distinct('place_id')
    twt_dist = twt_lst.groupfunc('place_id', len)
    tid_set = set(twt_lst.distinct('place_id'))
    pid_set = set(pid_lst)
    for pid in pid_lst:
        plc = dataset.loadrows(GEOTWEET, ('id', 'lat', 'lng', 'super_category'),
                               ('id = \'{0}\''.format(pid),), 'place')
        plc_type = plc[0]['super_category']
        tmp_lst = list()
        cand = dataset.type_random(plc_type)
        for twt in cand:
            if twt['id'] not in tid_set and twt['place_id'] not in pid_lst:
                if sel(twt, plc):
                    # relabel the borrowed tweet with the current place
                    twt['place_id'] = pid
                    tid_set.add(twt['id'])
                    pid_set.add(twt['place_id'])
                    tmp_lst.append(twt)
            if len(tmp_lst) >= ratio * twt_dist[pid]:
                break
        rst_lst.extend(tmp_lst)
    rst_lst.extend(twt_lst)
    return rst_lst
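# Example selector for cate_smooth (a sketch, not part of the original code):
# sel receives a candidate tweet and the place rows loaded inside cate_smooth,
# so a distance-based filter could look like the following. The 0.05-degree
# threshold and the assumption that candidate tweets carry 'lat'/'lng' fields
# are illustrative only.
#
#     def near_place(twt, plc, threshold=0.05):
#         return (abs(twt['lat'] - plc[0]['lat']) < threshold and
#                 abs(twt['lng'] - plc[0]['lng']) < threshold)
#
#     smoothed = cate_smooth(twt_lst, 1, near_place)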
def cnt_map(region, table='sample', draw=True):
    """Draw a region map of tweets"""
    twt_lst = dataset.loadrows(
        GEOTWEET, ('lat', 'lng'),
        ('MBRContains({0}, geo)'.format(dataset.geo_rect(*region)),), table)
    lat = list()
    lng = list()
    for twt in twt_lst:
        lat.append(twt['lat'])
        lng.append(twt['lng'])
    if draw:
        x = np.array(lng)
        y = np.array(lat)
        xmin = x.min()
        xmax = x.max()
        ymin = y.min()
        ymax = y.max()
        plt.hexbin(x, y, gridsize=200, cmap=cm.jet)
        plt.axis([xmin, xmax, ymin, ymax])
        plt.title("Hexagon binning")
        cb = plt.colorbar()
        cb.set_label('counts')
        plt.show()
    return lat, lng
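# Usage sketch (not from the original module): the shape of `region` depends on
# dataset.geo_rect(), which is assumed here to take the four corner coordinates
# of a bounding box; the numbers below are illustrative only.
#
#     lat, lng = cnt_map((40.70, -74.02, 40.80, -73.93), table='sample')
#
# The raw lat/lng lists are returned, so the points can be re-binned or mapped
# again without another database query.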
def region2arff(dst, region):
    """Generate data in the region in ARFF format"""
    twt_lst = dataset.loadrows(GEOTWEET, ("place_id", "text"),
                               ("MBRContains({0}, geo)".format(geo_rect(*region)),))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"])))
                               for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # cut all the keys that appear less than 3 times
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]
    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = twt_lst[i]["place_id"]
    vec_lst.gen_arff(dst, keylist)
def time_plot(place_id):
    """Plot the distribution of tweet times over the week for one place"""
    tims = list()
    rows = dataset.loadrows(GEOTWEET, ('created_at',),
                            ('place_id=\'{0}\''.format(place_id),), 'sample')
    for line in rows:
        if line['created_at'] is None:
            continue
        tim = time.strptime(str(line['created_at']), '%Y-%m-%d %H:%M:%S')
        # encode the timestamp as weekday plus the fraction of the day
        tims.append(tim.tm_wday + tim.tm_hour / 24.0)
    x = np.array(tims)
    plt.hist(x, 42)
    plt.title('Place {0}'.format(place_id))
    plt.show()
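# Usage sketch: any place_id from the sample table works, e.g. the one used in
# the smoothing demo at the bottom of this file.
#
#     time_plot('0002ac59702e20cf')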
def time_place_plot(user_id):
    """Plot when (over the week) and at which place a user tweets"""
    tims = list()
    plcs = list()
    idm = id2int()
    rows = dataset.loadrows(GEOTWEET, ('created_at', 'place_id'),
                            ('user_id={0}'.format(user_id),), 'sample')
    for line in rows:
        if line['created_at'] is None:
            continue
        tim = time.strptime(str(line['created_at']), '%Y-%m-%d %H:%M:%S')
        plc = line['place_id']
        tims.append(tim.tm_wday + tim.tm_hour / 24.0)
        plcs.append(idm.map(plc))
    x = np.array(tims)
    y = np.array(plcs)
    plt.plot(x, y, 'o')
    plt.title('User {0}'.format(user_id))
    plt.show()
def top_poi_100_crs(dst, city, col):
    """Select all POIs of the given city as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',),
                            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])
    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place
    # output the places as json objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])
    # output the tweets as json objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)
    # cut all the keys that appear less than 3 times
    bgdist = vec_lst.bgdist()
    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = places[twt_lst[i]['place_id']][col]
        vec_lst[i]['__NO__'] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
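# Usage sketch: `col` picks which place attribute becomes the class label and
# should be one of the keys stored in `places` above ('label', 'name',
# 'category' or 'super_category'); the output prefix is illustrative.
#
#     top_poi_100_crs('la_100', 'los angeles', 'super_category')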
def cate_smoothed_100(dst, city):
    """Select all POIs of the given city as the candidates, with category smoothing"""
    plst = dataset.loadrows(GEOTWEET, ("id",),
                            ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])
    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"])))
                               for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # cut all the keys that appear less than 3 times
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]
    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(
            dataset.place_name(twt_lst[i]["place_id"], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
def all_poi_100(dst, col):
    """Select all POIs in sample_dist_100 as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), None, "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])
    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place
    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])
    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)
    # cut all the keys that appear less than 3 times
    bgdist = vec_lst.bgdist()
    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append("__CLASS__")
    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key] / math.log(float(idf[key]) + 1)
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(places[twt_lst[i]["place_id"]][col])
        vec_lst[i]["__NO__"] = i
    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)
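# Usage sketch: `col` plays the same role as in top_poi_100_crs, but every
# place in sample_dist_100 is included; the output prefix is illustrative.
#
#     all_poi_100('all_100', 'category')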
if __name__ == '__main__':
    twt_lst = cate_smooth(
        dataset.loadrows(GEOTWEET, ('text', 'place_id'),
                         ('place_id = \'0002ac59702e20cf\'',)),
        10, lambda x, y: True)
    print '----------------------'
    for twt in twt_lst:
        print twt
def cnt_poi(city, table='sample'):
    """Count tweets per POI in a city (tweet distribution over POIs)."""
    twt_lst = dataset.loadrows(GEOTWEET, ('place_id', 'count(id) as cnt'),
                               ("superior_id='{0}'".format(city),), table,
                               'group by place_id')
    return twt_lst
def top_poi100_map():
    """Plot the sampled top-100 POIs of Los Angeles on a map"""
    plc_lst = dataset.loadrows(GEOTWEET, ('lat', 'lng', 'name'),
                               ('superior_name=\'los angeles\'',), 'sample_dist_100')
    for plc in plc_lst:
        plc['text'] = plc['name']
    geo_map('../la_100.html', plc_lst)