def online_run(self, interval=10, peroid=0.5, quiet=True):
    '''
    Poll the online source repeatedly and collect every new stat seen.

    ``self.retrieve('on', quiet=quiet)`` is called once per period. The
    elapsed counter starts at one period and grows by ``peroid`` after each
    poll; polling stops once it is no longer below ``interval``. After each
    poll the method sleeps ``peroid * 60`` seconds, so ``peroid`` (and
    therefore ``interval``) is expressed in minutes.

    Stats are de-duplicated on ``dic['id']`` across all polls of this call.
    An exception raised while polling or processing a response is logged
    with ``logging.exception`` and the loop carries on; the remaining
    entries of that response are skipped for that poll.

    interval: length of the polling window (same unit as ``peroid``).
    peroid:   time between polls, in minutes. Must be positive whenever
              at least one poll would run.
    quiet:    forwarded unchanged to ``self.retrieve``.

    return value: [(city, stat)...] where ``city`` is the result of
    ``ST.parse_spatial(dic)`` and ``stat`` is ``dic['text']``.

    Raises ValueError when polling would start with a non-positive
    ``peroid``: the elapsed counter could never reach ``interval``.
    '''
    elapsed = peroid
    if elapsed < interval and peroid <= 0:
        # Zero would spin forever without sleeping; a negative value would
        # only fail later inside time.sleep(). Fail early and clearly.
        raise ValueError(
            'peroid must be positive, got %r' % (peroid,))

    seen_ids = set()
    stats = []
    cnt = 0
    while elapsed < interval:
        try:
            rsp = self.retrieve('on', quiet=quiet)
            cnt += 1  # counts only polls where retrieve() did not raise
            for dic in rsp or ():
                stat_id = dic['id']
                if stat_id in seen_ids:
                    continue
                city = ST.parse_spatial(dic)
                stats.append((city, dic['text']))
                # Marked as seen only after it was stored, so a stat whose
                # processing failed is retried on a later poll.
                seen_ids.add(stat_id)
        except Exception as e:
            # Best-effort polling: log and keep going.
            logging.exception(e)
        elapsed += peroid
        time.sleep(peroid * 60)

    linfo('online analysis %s new stats retrieved. retrieve cnt: %s'
          % (len(stats), cnt))
    return stats
def parse_city_public_stats(in_path='../stats/train_public_stats',
                            out_path='../test_data/city_test_data',
                            txts_upperbound=1000):
    '''
    Group public stats by city and dump them as JSON lines.

    Reads ``in_path`` one JSON object per line, drops records whose
    ``dic['id']`` was already seen, resolves a city for each record with
    ``ST.parse_spatial(dic)`` and keeps up to ``txts_upperbound`` texts
    (``dic['text']``) per city. Records for which no city is resolved are
    skipped. Blank lines are ignored.

    ``out_path`` is deleted first if it exists, then one line
    ``json.dumps({city: text})`` is passed to ``ET.write_file`` in mode
    ``'a'`` for every kept text, cities ordered by number of kept texts,
    largest first.

    Progress counters and the elapsed time are printed to stdout.

    in_path:         input file, one JSON stat per line.
    out_path:        output file, rewritten from scratch.
    txts_upperbound: maximum number of texts kept per city.

    return value: None
    '''
    # NOTE(review): this file also contains a later definition with the
    # same name, which shadows this one at import time.
    st_t = time.time()
    city2txt = {}
    city_stat_cnt, total_cnt = 0, 0
    stat_ids = set()

    with open(in_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads('') would raise; a blank line is not a stat.
                continue
            total_cnt += 1
            dic = json.loads(line)
            if dic['id'] in stat_ids:
                continue
            stat_ids.add(dic['id'])
            city = ST.parse_spatial(dic)
            if not city:
                continue
            # Unique stats that resolved to a city, whether or not the
            # per-city cap below lets them through.
            city_stat_cnt += 1
            txts = city2txt.setdefault(city, [])
            if len(txts) >= txts_upperbound:
                continue
            txts.append(dic['text'])

    locs = sorted(city2txt, key=lambda c: len(city2txt[c]), reverse=True)

    # Single-argument print calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('city_stat_cnt %s' % city_stat_cnt)
    print('total_cnt %s' % total_cnt)
    print('time used: %.2f' % (time.time() - st_t))

    # Start from an empty file because the writes below append.
    if os.path.exists(out_path):
        os.remove(out_path)
    for city in locs:
        for txt in city2txt[city]:
            ET.write_file(out_path, 'a', '%s\n' % json.dumps({city: txt}))
def parse_city_public_stats(in_path='../stats/train_public_stats',
                            out_path='../test_data/city_test_data',
                            txts_upperbound=1000):
    '''
    Build a per-city text sample from the public stats file.

    Every non-blank line of ``in_path`` is parsed as a JSON object. A record
    is ignored when its ``dic['id']`` has been seen before or when
    ``ST.parse_spatial(dic)`` returns a falsy city. For each remaining
    record ``dic['text']`` is stored under its city until that city holds
    ``txts_upperbound`` texts.

    Any existing ``out_path`` is removed, then for each stored text the
    line ``json.dumps({city: text})`` is handed to ``ET.write_file`` with
    mode ``'a'``. Cities are emitted in descending order of stored texts.

    The counters and the elapsed time are printed to stdout.

    in_path:         input file, one JSON stat per line.
    out_path:        output file, rewritten from scratch.
    txts_upperbound: maximum number of texts kept per city.

    return value: None
    '''
    # NOTE(review): an earlier definition with the same name exists in this
    # file; being the later one, this definition is the one that takes
    # effect at import time.
    started = time.time()
    city2txt = {}
    seen_ids = set()
    total_cnt = 0
    city_stat_cnt = 0

    with open(in_path, 'r') as src:
        for raw in src:
            raw = raw.strip()
            if not raw:
                # An empty string is not valid JSON; skip instead of crashing.
                continue
            total_cnt += 1
            dic = json.loads(raw)
            stat_id = dic['id']
            if stat_id in seen_ids:
                continue
            seen_ids.add(stat_id)
            city = ST.parse_spatial(dic)
            if not city:
                continue
            # Counts unique stats with a resolved city, including those the
            # per-city cap discards.
            city_stat_cnt += 1
            bucket = city2txt.setdefault(city, [])
            if len(bucket) < txts_upperbound:
                bucket.append(dic['text'])

    locs = sorted(city2txt, key=lambda c: len(city2txt[c]), reverse=True)

    # Written as single-argument calls so the output is identical whether
    # print is a statement (Python 2) or a function (Python 3).
    print('city_stat_cnt %s' % city_stat_cnt)
    print('total_cnt %s' % total_cnt)
    print('time used: %.2f' % (time.time() - started))

    # The writer appends, so clear any previous output first.
    if os.path.exists(out_path):
        os.remove(out_path)
    for city in locs:
        for txt in city2txt[city]:
            dic = {city: txt}
            ET.write_file(out_path, 'a', '%s\n' % json.dumps(dic))